diff --git a/.github/workflows/create_stubs.yml b/.github/workflows/create_stubs.yml
index e3883d0..2c8fa48 100644
--- a/.github/workflows/create_stubs.yml
+++ b/.github/workflows/create_stubs.yml
@@ -31,7 +31,10 @@ jobs:
         poetry install --only dev
     - name: Create Stubs
       run: |
-        poetry run python scripts/generate_polars_stubs.py
+        poetry run python scripts/generate_polars_stubs.py pr_body.md
     - name: Create Pull Request
       uses: peter-evans/create-pull-request@v5
+      with:
+        body-path: pr_body.md
+

diff --git a/polugins_type_gen/poetry.lock b/polugins_type_gen/poetry.lock
index a853c9d..a7d78f1 100644
--- a/polugins_type_gen/poetry.lock
+++ b/polugins_type_gen/poetry.lock
@@ -494,4 +494,4 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "044adb87c77d374c53a7dae2ce72438d5a8d97526dffef2dc5b1bd2da477ce81"
+content-hash = "2886b73514d4597aba04c4a8324b99a19bb9c7a26291ab4b97d086efc5f45996"
diff --git a/polugins_type_gen/pyproject.toml b/polugins_type_gen/pyproject.toml
index 8d7fe54..dc77c6f 100644
--- a/polugins_type_gen/pyproject.toml
+++ b/polugins_type_gen/pyproject.toml
@@ -24,6 +24,7 @@ pytest-env = "^0.8.2"
 pytest-cov = "^4.1.0"
 example-package = {path = "tests/pkgs/example_package"}
 pytest-mock = "^3.12.0"
+packaging = "^23.2"
 
 [tool.poetry.group.nox]
 optional = true
diff --git a/polugins_type_gen/scripts/generate_polars_stubs.py b/polugins_type_gen/scripts/generate_polars_stubs.py
index f6afaa9..ea455da 100644
--- a/polugins_type_gen/scripts/generate_polars_stubs.py
+++ b/polugins_type_gen/scripts/generate_polars_stubs.py
@@ -1,63 +1,64 @@
 import subprocess
 import sys
+from difflib import unified_diff
+from itertools import pairwise
 from pathlib import Path
-
-from mypy.stubgen import Options as StubOptions
-from mypy.stubgen import generate_stubs
-from polugins._types import ExtensionClass
-
-
-def generate_polars_stub(output_dir: Path):
-    modules = [".".join(enum.import_path.parts) for enum in ExtensionClass]
-    options = StubOptions(
-        inspect=True,
-        # 'module_file' contains the base class
-        # onto which dynamic methods are registered
-        files=[],
-        output_dir=str(output_dir),
-        include_private=True,
-        export_less=False,
-        # standard params (not auto-defaulted)
-        pyversion=sys.version_info[:2],
-        interpreter=sys.executable,
-        ignore_errors=False,
-        parse_only=False,
-        no_import=False,
-        search_path=["."],
-        doc_dir="",
-        packages=[],
-        modules=modules,
-        verbose=True,  # TODO: change this, but nice for debugging now
-        quiet=False,
-        include_docstrings=True,
+from tempfile import TemporaryDirectory
+
+from packaging import version
+
+IMPORT_PATHS = [
+    Path("polars", "expr", "expr"),
+    Path("polars", "series", "series"),
+    Path("polars", "lazyframe", "frame"),
+    Path("polars", "dataframe", "frame"),
+]
+
+
+def run_stubgen(
+    version: str, no_docstring_stub_path: Path, stub_path: Path, tempdir_path: Path
+) -> None:
+    venv_path = tempdir_path / f".venv{version}"
+    bin_path = venv_path / "bin" / "python"
+    subprocess.check_call([sys.executable, "-m", "venv", str(venv_path)])
+    subprocess.check_call([bin_path, "-m", "pip", "install", f"polars=={version}", "mypy"])
+    subprocess.check_call([bin_path, "scripts/stubgen.py", stub_path, "true"])
+    subprocess.check_call(
+        [bin_path, "scripts/stubgen.py", tempdir_path / no_docstring_stub_path, "false"]
     )
-    generate_stubs(options)
 
 
-def install_polars(version: str):
-    subprocess.check_call([sys.executable, "-m", "pip", "install", f"polars=={version}"])
+def get_current_versions() -> set[version.Version]:
+    stub_dir = Path(__file__).parent.parent / "src" / "polugins_type_gen" / "_stubs"
+    return {version.parse(p.parts[-1]) for p in stub_dir.iterdir()}
 
 
-def get_versions():
+def get_available_versions() -> set[version.Version]:
     res = subprocess.run(
         [sys.executable, "-m", "pip", "index", "versions", "polars"],
         capture_output=True,
         text=True,
     )
     version_line_start = "Available versions: "
+    versions = None
     for line in res.stdout.splitlines():
         if line.startswith(version_line_start):
             versions = line.split(version_line_start)[1].split(",")
             break
-    versions = [version.strip() for version in versions]
-    return versions
+    assert versions
+    return {version.parse(version_str.strip()) for version_str in versions}
+
+
+def get_missing_versions() -> set[version.Version]:
+    # 0.16.13 -> 0.16.14 changes location of imports
+    oldest_version = version.Version("0.16.13")
+    return {v for v in (get_available_versions() - get_current_versions()) if v > oldest_version}
 
 
-def clean_types(path: Path):
-    extensions_class = ExtensionClass(path.parts[5])
+
+def clean_types(path: Path, version):
     stub_content = path.read_text()
-    match extensions_class:
-        case ExtensionClass.DATAFRAME:
+    match path.parts[-2]:
+        case "dataframe":
             if (txt := "P: Incomplete") in stub_content:
                 stub_content = (
                     "from typing_extensions import ParamSpec, Generic\n"
@@ -66,7 +67,9 @@ def clean_types(path: Path):
                     )
                 )
             stub_content = stub_content.replace("_df: Incomplete", "_df: PyDataFrame")
-        case ExtensionClass.LAZYFRAME:
+            stub_content = stub_content.replace("columns: Incomplete", "columns: list[str]")
+
+        case "lazyframe":
             if (txt := "P: Incomplete") in stub_content:
                 stub_content = (
                     "from typing_extensions import ParamSpec, Generic\n"
@@ -75,7 +78,7 @@ def clean_types(path: Path):
                     )
                 )
             stub_content = stub_content.replace("_ldf: Incomplete", "_ldf: PyLazyFrame")
-        case ExtensionClass.EXPR:
+        case "expr":
             if (txt := "P: Incomplete") in stub_content:
                 stub_content = (
                     "from typing_extensions import ParamSpec, Generic\n"
@@ -83,7 +86,7 @@ def clean_types(path: Path):
                         "class Expr", "class Expr(Generic[P])"
                     )
                 )
-        case ExtensionClass.SERIES:
+        case "series":
             array_like = (
                 'ArrayLike = Union[Sequence[Any], "Series", '
                 '"pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"]'
@@ -94,32 +97,91 @@ def clean_types(path: Path):
             stub_content = stub_content.replace("len: Incomplete", "len: int").replace(
                 "s: Incomplete", "s: Series"
             )
+        case err:
+            raise ValueError(err)
     stub_content = stub_content.replace("from _typeshed import Incomplete", "")
-
-    path.with_suffix("").write_text(stub_content)
+    stub_content = f"#: version {version}\n" + stub_content
+    return stub_content
 
 
 def is_incomplete(path: Path):
-    assert "Incomplete" in path.read_text()
+    return "Incomplete" in path.read_text()
+
+
+def main(tmp_dir: Path):
+    versions = get_missing_versions()
+    print(f"Missing versions: {versions}")
+    for version_ in versions:
+        output_dir = Path("src", "polugins_type_gen", "_stubs", str(version_))
+        no_docstring_output_dir = Path("no_docstring", str(version_))
+        output_dir.mkdir(parents=True)
+        run_stubgen(str(version_), no_docstring_output_dir, output_dir, tmp_dir)
+        for import_path in IMPORT_PATHS:
+            stub_path = output_dir / import_path.with_suffix(".pyi")
+            cleaned_stub_content = clean_types(stub_path, version_)
+            stub_path.with_suffix(".pyi").write_text(cleaned_stub_content)
+            if is_incomplete(stub_path):
+                msg = f"File {stub_path} could not be cleaned and has Incomplete types."
+                raise ValueError(msg)
+    return versions
 
 
-def main(force: bool = False):
-    versions = get_versions()
-    for version in versions:
-        # 0.16.13 -> 0.16.14 changes location of imports
-        if version == "0.16.13":
-            break
-        output_dir = Path("src", "polugins_type_gen", "_stubs", version)
-        if output_dir.exists() and not force:
-            continue
-        output_dir.mkdir(parents=True, exist_ok=True)
-        install_polars(version)
-        generate_polars_stub(output_dir)
-        for extension_class in ExtensionClass:
-            stub_path = output_dir / extension_class.import_path.with_suffix(".pyi")
-            clean_types(stub_path)
-            stub_path.unlink()
+def diff_chunk(content: str):
+    return f"```diff\n{content}\n```\n"
+
+
+def comparison_section(
+    version_1: version.Version, version_2: version.Version, comparisons: list[tuple[str, str]]
+):
+    header = f"# Changes from {version_1} to {version_2}\n"
+    body = ""
+    for extension_class, diff in comparisons:
+        body += f"## {extension_class}\n{diff_chunk(diff)}"
+    return header + body
+
+
+def create_pr_body(versions: set[version.Version]):
+    current_versions = get_current_versions()
+    newest_current_version = max(current_versions - versions)
+
+    comparisons = {
+        (version_1, version_2): compare_versions(version_1, version_2)
+        for version_1, version_2 in pairwise(sorted(versions.union([newest_current_version])))
+    }
+    header = "# Automatic stub gen\n Changes between new versions and last:\n"
+    body = "\n".join(
+        comparison_section(version_1, version_2, comparison)
+        for (version_1, version_2), comparison in comparisons.items()
+    )
+    return header + body
+
+
+def compare_versions(version_1: version.Version, version_2) -> list[tuple[str, str]]:
+    results = []
+    stub_dir_1 = Path("no_docstring", str(version_1))
+    stub_dir_2 = Path("no_docstring", str(version_2))
+    for extension_class in IMPORT_PATHS:
+        stub_path1 = stub_dir_1 / extension_class.with_suffix(".pyi")
+        stub_path2 = stub_dir_2 / extension_class.with_suffix(".pyi")
+        result = "\n".join(
+            unified_diff(
+                stub_path1.read_text().splitlines(),
+                stub_path2.read_text().splitlines(),
+                fromfile=str(version_1),
+                tofile=str(version_2),
+            )
+        )
+        results.append((extension_class.parts[-2], result))
+    return results
 
 
 if __name__ == "__main__":
-    main()
+    with TemporaryDirectory() as tempdir:
+        tempdir_path = Path(tempdir)
+        new_versions = main(tempdir_path)
+        body_content = create_pr_body(new_versions)
+
+        body_path = Path(sys.argv[1]) if len(sys.argv) > 1 else tempdir_path / "pr_body.md"
+        print(body_path)
+
+        body_path.write_text(body_content)
diff --git a/polugins_type_gen/scripts/stubgen.py b/polugins_type_gen/scripts/stubgen.py
new file mode 100644
index 0000000..515f0de
--- /dev/null
+++ b/polugins_type_gen/scripts/stubgen.py
@@ -0,0 +1,41 @@
+import sys
+from pathlib import Path
+
+from mypy.stubgen import Options as StubOptions
+from mypy.stubgen import generate_stubs
+
+IMPORT_PATHS = [
+    Path("polars", "expr", "expr"),
+    Path("polars", "series", "series"),
+    Path("polars", "lazyframe", "frame"),
+    Path("polars", "dataframe", "frame"),
+]
+
+
+if __name__ == "__main__":
+    output_dir = sys.argv[1]
+    modules = [".".join(import_path.parts) for import_path in IMPORT_PATHS]
+    include_docstrings = sys.argv[2] == "true"
+    options = StubOptions(
+        inspect=True,
+        # 'module_file' contains the base class
+        # onto which dynamic methods are registered
+        files=[],
+        output_dir=output_dir,
+        include_private=True,
+        export_less=False,
+        # standard params (not auto-defaulted)
+        pyversion=sys.version_info[:2],
interpreter=sys.executable, + ignore_errors=False, + parse_only=False, + no_import=False, + search_path=["."], + doc_dir="", + packages=[], + modules=modules, + verbose=True, # TODO: change this, but nice for debugging now + quiet=False, + include_docstrings=include_docstrings, + ) + generate_stubs(options) diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/dataframe/frame deleted file mode 100644 index 989f05b..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/dataframe/frame +++ /dev/null @@ -1,274 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from io import BytesIO, IOBase -from pathlib import Path -from polars import internals as pli -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturned as NoRowsReturned, TooManyRowsReturned as TooManyRowsReturned -from polars.internals.construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.internals.io_excel import ColumnTotalsDefinition as ColumnTotalsDefinition, ConditionalFormatDict as ConditionalFormatDict, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.series.series import Series as Series -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as 
SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -def wrap_df(df: PyDataFrame) -> DataFrame: ... - -class DataFrame(Generic[P]): - _accessors: set[str] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], infer_schema_length: int | None = ..., schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | list[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, columns: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: ... - def _div(self, other: Any, floordiv: bool) -> Self: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... 
- def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int]) -> Series: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, str]) -> Series: ... - @overload - def __getitem__(self, item: tuple[int, int]) -> Any: ... - @overload - def __getitem__(self, item: tuple[int, str]) -> Any: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - @overload - def write_json(self, file: None = ..., pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | str | Path, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, include_header: bool = ..., header_name: str = ..., column_names: Iterator[str] | Sequence[str] | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: pli.Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: ... - @overload - def glimpse(self, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, return_as_string: Literal[True]) -> str: ... - def describe(self) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def frame_equal(self, other: DataFrame, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_col: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def pipe(self, func: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy[Self]: ... - def groupby_rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> RollingGroupBy[Self]: ... 
- def groupby_dynamic(self, index_column: str, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> DynamicGroupBy[Self]: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, left_on: str | None | pli.Expr = ..., right_on: str | None | pli.Expr = ..., on: str | None | pli.Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: DataFrame, left_on: str | pli.Expr | Sequence[str | pli.Expr] | None = ..., right_on: str | pli.Expr | Sequence[str | pli.Expr] | None = ..., on: str | pli.Expr | Sequence[str | pli.Expr] | None = ..., how: JoinStrategy = ..., suffix: str = ...) -> Self: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., inference_size: int = ...) -> Self: ... - def hstack(self, columns: list[Series] | DataFrame, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, fill_value: pli.Expr | int | float | None) -> Self: ... - def explode(self, columns: str | Sequence[str] | pli.Expr | Sequence[pli.Expr], *more_columns: str | pli.Expr) -> Self: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | pli.Expr = ..., maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, periods: int, fill_value: int | str | float) -> Self: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> pli.LazyFrame: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... 
- def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ...) -> Self: ... - def unique(self, maintain_order: bool = ..., subset: str | Sequence[str] | None = ..., keep: UniqueKeepStrategy = ...) -> Self: ... - def n_unique(self, subset: str | pli.Expr | Sequence[str | pli.Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: pli.Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: pli.Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, named: Literal[True]) -> list[dict[str, Any]]: ... - @overload - def iter_rows(self, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> Self: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs): ... - def merge_sorted(self, other: DataFrame, key: str) -> Self: ... 
- def update(self, other: DataFrame, on: None | str | Sequence[str] = ..., how: str = ...) -> Self: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/dataframe/frame.pyi new file mode 100644 index 0000000..e3540f4 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/dataframe/frame.pyi @@ -0,0 +1,5253 @@ +#: version 0.16.14 +import P +import np as np +import pa as pa +import pd as pd +import pli +from _io import BytesIO + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Object as Object, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturned as NoRowsReturned, TooManyRowsReturned as TooManyRowsReturned +from polars.internals.construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.internals.io_excel import _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +FLOAT_DTYPES: frozenset +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool +def wrap_df(df: PyDataFrame) -> DataFrame: ... + +class DataFrame: + _accessors: ClassVar[set] = ... + columns: list[str] + def __init__(self, *args, **kwargs) -> None: ... 
+ @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], infer_schema_length: int | None = ..., schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, *args, **kwargs) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, *args, **kwargs) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. 
+ * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, *args, **kwargs) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, *args, **kwargs) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. 
+ + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | list[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> Self: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> Self: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. 
+ allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> Self: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int] | tuple[MultiRowSelector, str] | tuple[int, int] | tuple[int, str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self) -> Any: + ''' + Return the dataframe as a scalar. + + Equivalent to ``df[0,0]``, with a check that the shape is (1,1). 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> result = df.select((pl.col("a") * pl.col("b")).sum()) + >>> result + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 32 │ + └─────┘ + >>> result.item() + 32 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> numpy_array = df.to_numpy() + >>> type(numpy_array) + + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.DataFrame: + ''' + Cast to a pandas DataFrame. 
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. 
+ + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. 
(This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. Valid names are: + "average", "count_nums", "count", "max", "min", "std_dev", "sum", "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. 
+ + Notes + ----- + Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, see: + https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should be adjacent to each other. + Two other polars-specific keys are available to help define where the sparkline + appears in the table: "insert_after", and "insert_before". The value associated + with these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... 
ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... hide_gridlines=True, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. 
+ + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection uri, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional generator/iterator that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: pli.Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self) -> Self: + ''' + Summary statistics for a DataFrame. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (7, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, *args, **kwargs) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_col: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_col + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, func: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + func + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... 
+ >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy[Self]: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... 
print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: str) -> RollingGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + *groupby_dynamic* + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime) + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: str) -> DynamicGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + - \'window\': Truncate the start of the window with the \'every\' argument. + - \'datapoint\': Start from the first encountered data point. + - \'monday\': Start the window on the monday before the first data point. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... low=datetime(2021, 12, 16), + ... high=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "n": range(7), + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... 
) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... low=datetime(2021, 12, 16), + ... high=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. 
This is slower. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ) + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... 
], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ) + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ) + >>> population.join_asof( + ... gdp, left_on="date", right_on="date", strategy="backward" + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame) -> Self: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. 
+ + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select([pl.col("foo") * 2, pl.col("bar") * 3]) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... 
) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current DataFrame, with zero to \'n\' rows. + + Returns a DataFrame with identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, fill_value: pli.Expr | int | float | None) -> Self: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + fill_value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | pli.Expr | Sequence[pli.Expr], *more_columns: str | pli.Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | pli.Expr = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function : {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + A predefined aggregate function str or an expression. 
+ maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar") + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... 
) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ … ┆ … │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, *args, **kwargs) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. 
+ + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, periods: int, fill_value: int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + fill_value + fill None values with this value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> pli.LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. 
+ + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. 
The columns + will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... 
+ shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + Degrees of freedom + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + Degrees of freedom + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'none\'} + Which of the duplicate rows to keep. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"]) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last") + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | pli.Expr | Sequence[str | pli.Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. 
+ """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `frac`. Defaults to 1 if + `frac` is None. + frac + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. 
The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturned``, and + zero rows will raise ``NoRowsReturned`` (both inherit from ``RowsException``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you absolutely + require row-iteration you should strongly prefer ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialises all frame data as a list of rows. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.rows() + [(1, 2), (3, 4), (5, 6)] + >>> df.rows(named=True) + [{\'a\': 1, \'b\': 2}, {\'a\': 3, \'b\': 4}, {\'a\': 5, \'b\': 6}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). 
Setting this + value to zero disables row buffering. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + An iterator of tuples (default) or dictionaries of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... 
) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, *args, **kwargs) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> Self: + ''' + Return Pearson product-moment correlation coefficients. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. 
+ + Parameters + ---------- + kwargs + keyword arguments are passed to numpy corrcoef + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def update(self, other: DataFrame, on: None | str | Sequence[str] = ..., how: str = ...) -> Self: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'Left\' will keep the left table rows as is. + \'Inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/expr/expr deleted file mode 100644 index 908acf3..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/expr/expr +++ /dev/null @@ -1,243 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import date, datetime, time, timedelta -from polars.dataframe.frame import DataFrame as DataFrame -from polars.datatypes import Struct as Struct, UInt32 as UInt32, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.lazyframe.frame import LazyFrame as LazyFrame -from polars.polars import PyExpr as PyExpr -from polars.series.series import Series as Series -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -def selection_to_pyexpr_list(exprs: IntoExpr | Iterable[IntoExpr], structify: bool = ...) -> list[PyExpr]: ... -def expr_output_name(expr: Expr) -> str | None: ... -def expr_to_lit_or_expr(expr: IntoExpr | Iterable[IntoExpr], str_to_lit: bool = ..., structify: bool = ..., name: str | None = ...) -> Expr: ... -def wrap_expr(pyexpr: PyExpr) -> Expr: ... - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: set[str] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __invert__(self) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __and__(self, other: Expr) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __or__(self, other: Expr) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... 
- def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __ge__(self, other: Any) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __pos__(self) -> Expr: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: Expr, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, reverse: bool = ...) -> Self: ... - def cumprod(self, reverse: bool = ...) -> Self: ... - def cummin(self, reverse: bool = ...) -> Self: ... - def cummax(self, reverse: bool = ...) -> Self: ... - def cumcount(self, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], strict: bool = ...) -> Self: ... - def sort(self, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ..., descending: bool = ...) -> Self: ... - def arg_sort(self, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... 
- def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, periods: int, fill_value: int | float | bool | str | Expr | list[Any]) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, fill_value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., skip_nulls: bool = ..., pass_name: bool = ..., *, strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def is_in(self, other: Expr | Sequence[Any] | str | Series) -> Self: ... - def repeat_by(self, by: Expr | str) -> Self: ... - def is_between(self, start: Expr | datetime | date | time | int | float | str, end: Expr | datetime | date | time | int | float | str, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... 
- def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def argsort(self, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def rank(self, method: RankMethod = ..., descending: bool = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, bias: bool = ...) -> Self: ... - def kurtosis(self, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, min_val: int | float, max_val: int | float) -> Self: ... - def clip_min(self, min_val: int | float) -> Self: ... - def clip_max(self, max_val: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def reshape(self, dims: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... 
- def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def entropy(self, base: float = ..., normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., parallel: bool = ...) -> Self: ... - def set_sorted(self, descending: bool = ...) -> Self: ... - def list(self) -> Self: ... - def shrink_dtype(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ...) -> Self: ... - @property - def arr(self) -> ExprListNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/expr/expr.pyi new file mode 100644 index 0000000..fa508ef --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/expr/expr.pyi @@ -0,0 +1,5467 @@ +#: version 0.16.14 +import P +import np as np +from datetime import date, datetime, time, timedelta +from polars.datatypes.classes import Struct as Struct, UInt32 as UInt32 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method +def selection_to_pyexpr_list(exprs: IntoExpr | Iterable[IntoExpr], structify: bool = ...) -> list[PyExpr]: ... +def expr_output_name(expr: Expr) -> str | None: ... +def expr_to_lit_or_expr(expr: IntoExpr | Iterable[IntoExpr], str_to_lit: bool = ..., structify: bool = ..., name: str | None = ...) 
-> Expr: + ''' + Convert args to expressions. + + Parameters + ---------- + expr + Any argument. + str_to_lit + If True string argument `"foo"` will be converted to `lit("foo")`. + If False it will be converted to `col("foo")`. + structify + If the final unaliased expression has multiple output names, + automatically convert it to struct. + name + Apply the given name as an alias to the resulting expression. + + Returns + ------- + Expr + + ''' +def wrap_expr(pyexpr: PyExpr) -> Expr: ... + +class Expr: + _pyexpr: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __invert__(self) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __and__(self, other: Expr) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __or__(self, other: Expr) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __ge__(self, other: Any) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def ge(self, other: Any) -> Self: ... + def le(self, other: Any) -> Self: ... + def eq(self, other: Any) -> Self: ... + def ne(self, other: Any) -> Self: ... + def lt(self, other: Any) -> Self: ... + def gt(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __pos__(self) -> Expr: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... 
) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + pl.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... [ + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ] + ... 
) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. + + >>> df.select([(pl.lit(10) / pl.all()).keep_name()]) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr): + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + ... + >>> df = pl.DataFrame({"a": ["a: 1", "b: 2", "c: 3"]}) + >>> df.with_columns(pl.col("a").pipe(extract_number)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... 
"fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, *args, **kwargs) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + >>> df.select( + ... pl.all().reverse().map_alias(lambda colName: colName + "_reverse") + ... 
) + shape: (2, 2) + ┌───────────┬───────────┐ + │ A_reverse ┆ B_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════════╪═══════════╡ + │ 2 ┆ 4 │ + │ 1 ┆ 3 │ + └───────────┴───────────┘ + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: Expr, upcast: bool = ...) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └─────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... 
pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any], strict: bool = ...) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... 
) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, *args, **kwargs) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + descending + Return the smallest elements. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").top_k(descending=True).alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, *args, **kwargs) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. 
Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, periods: int, fill_value: int | float | bool | str | Expr | list[Any]) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + fill_value + Fill None values with the result of this expression. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill(1, "a")) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.fill_null(strategy="zero") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ 0 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(99) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ 99 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(strategy="forward") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def fill_nan(self, fill_value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.fill_nan("zero") + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪══════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ zero │ + │ zero ┆ 6.0 │ + └──────┴──────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 6 │ + │ null ┆ 6 │ + └──────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + Degrees of freedom. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + Degrees of freedom. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self, maintain_order: bool = ...) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. 
+ + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + If not set, polars will assume that + the dtype remains unchanged. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + .. 
deprecated:: 0.15.16 + `Expr.explode` will be removed in favour of `Expr.arr.explode` and + `Expr.str.explode`. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + ExprStringNameSpace.explode : Explode a string column. + + """ + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`Expr.head`. + + Parameters + ---------- + n + Number of rows to return. + + """ + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Raise expression to the power of exponent. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").pow(3)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ 8.0 │ + │ 27.0 │ + │ 64.0 │ + └──────┘ + + ''' + def is_in(self, other: Expr | Sequence[Any] | str | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: Expr | str) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, start: Expr | datetime | date | time | int | float | str, end: Expr | datetime | date | time | int | float | str, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. 
+ + Parameters + ---------- + start + Lower bound value (can be an expression or literal). + end + Upper bound value (can be an expression or literal). + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self, signed: bool = ...) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... 
) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. 
This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_min(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + └──────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_max(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 8.0, 6.0, 2.0, 16.0, 10.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_mean(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.5 │ + │ 7.0 │ + │ 4.0 │ + │ 9.0 │ + │ 13.0 │ + └──────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. 
This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_sum(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 3.0 │ + │ 5.0 │ + │ 7.0 │ + │ 9.0 │ + │ 11.0 │ + └──────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Compute a rolling standard deviation. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_std(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 1.527525 │ + │ 2.0 │ + └──────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. 
The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_var(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 2.333333 │ + │ 4.0 │ + └──────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_median(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_quantile(quantile=0.33, window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + └──────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. 
If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int, bias: bool = ...) -> Self: + """ + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + """ + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def argsort(self, *args, **kwargs) -> Self: + ''' + Get the index values that would sort this column. + + Alias for :func:`Expr.arg_sort`. + + .. deprecated:: 0.16.5 + `Expr.argsort` will be removed in favour of `Expr.arg_sort`. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").argsort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def rank(self, *args, **kwargs) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) 
-> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self, bias: bool = ...) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self, fisher: bool = ..., bias: bool = ...) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. 
+ If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, min_val: int | float, max_val: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + min_val + Minimum value. + max_val + Maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, min_val: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + min_val + Minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, max_val: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + max_val + Maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def reshape(self, dims: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dims + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `frac`. Defaults to 1 if + `frac` is None. + frac + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(frac=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. 
+ + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self, multithreaded: bool = ..., sort: bool = ...) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def entropy(self, base: float = ..., normalize: bool = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ..., parallel: bool = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self, *args, **kwargs) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def list(self) -> Self: + ''' + Aggregate to list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().list()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... 
).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + ''' + @property + def arr(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def bin(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/lazyframe/frame deleted file mode 100644 index 576e104..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/lazyframe/frame +++ /dev/null @@ -1,124 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars.dataframe.frame import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.expr.expr import Expr as Expr -from polars.internals import selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.series.series import Series as Series -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, PolarsExprType as PolarsExprType, PythonLiteral as PythonLiteral, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as 
SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath -from typing import Any, Callable, Concatenate, Iterable, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -def wrap_ldf(ldf: PyLazyFrame) -> LazyFrame: ... - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: set[str] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - @classmethod - def _scan_csv(cls, source: str, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | list[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... 
- def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, func: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_plan(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_optimized_plan(self, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, optimized: bool = ..., *, show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... 
- def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy[Self]: ... - def groupby_rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> LazyGroupBy[Self]: ... - def groupby_dynamic(self, index_column: str, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> LazyGroupBy[Self]: ... - def join_asof(self, other: LazyFrame, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., suffix: str = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, exprs: str | PolarsExprType | PythonLiteral | Series | Iterable[str | PolarsExprType | PythonLiteral | Series | None] | None = ..., *more_exprs: str | PolarsExprType | PythonLiteral | Series | None, **named_exprs: str | PolarsExprType | PythonLiteral | Series | None) -> Self: ... - def with_context(self, other): ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, periods: int, fill_value: Expr | int | str | float) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, fill_value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, maintain_order: bool = ..., subset: str | Sequence[str] | None = ..., keep: UniqueKeepStrategy = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) 
-> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def update(self, other: LazyFrame, on: None | str | Sequence[str] = ..., how: str = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..fed4189 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/lazyframe/frame.pyi @@ -0,0 +1,3100 @@ +#: version 0.16.14 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.expr.expr import selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int +def wrap_ldf(ldf: PyLazyFrame) -> LazyFrame: ... + +class LazyFrame: + _accessors: ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | list[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. 
+ + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> Self: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, func: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + func + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. 
+ *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def describe_plan(self) -> str: + ''' + Create a string representation of the unoptimized query plan. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + .. deprecated:: 0.16.10 + Use ``LazyFrame.explain`` + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).describe_plan() # doctest: +SKIP + + ''' + def describe_optimized_plan(self) -> str: + """Create a string representation of the optimized query plan.""" + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. 
+ show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, *args, **kwargs) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. 
+ + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. 
+ Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. 
+ slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy[Self]: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. 
+ This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: str) -> LazyGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... 
"2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime) + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: str) -> LazyGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + * \'datapoint\': Start from the first encountered data point. 
+ * \'monday\': Start the window on the monday before the first data point. + + See Also + -------- + groupby_rolling + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... low=datetime(2021, 12, 16), + ... high=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... 
).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... low=datetime(2021, 12, 16), + ... high=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... 
).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ) + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ) + >>> population.join_asof( + ... gdp, left_on="date", right_on="date", strategy="backward" + ... 
).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, exprs: str | PolarsExprType | PythonLiteral | Series | Iterable[str | PolarsExprType | PythonLiteral | Series | None] | None = ..., *more_exprs: str | PolarsExprType | PythonLiteral | Series | None, **named_exprs: str | PolarsExprType | PythonLiteral | Series | None) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. 
+ *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... 
lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, periods: int, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + fill_value + fill None values with the result of this expression. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(periods=1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. 
+ + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. 
+ strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, fill_value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + fill_value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their standard deviation value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their variance value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + subset + Column name(s) to consider when identifying duplicates. 
+ If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'none\'} + Which of the duplicate rows to keep. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique().collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"]).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last").collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. 
Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible ot running in the streaming + engine. That means that the function must produce the same result if it + is exectuted on batches as it would when executed on the full dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, *args, **kwargs) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. 
+ + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def update(self, other: LazyFrame, on: None | str | Sequence[str] = ..., how: str = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'Left\' will keep the left table rows as is. + \'Inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
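For context, here is a minimal, hypothetical smoke test of the LazyFrame API documented in the stub above. It is not part of the patch; it assumes a polars release from the era these stubs target (where the method is still spelled `groupby` — newer releases rename it to `group_by`), and the data and names are illustrative only:

import polars as pl

# Build a tiny LazyFrame and chain a few of the stubbed methods.
lf = pl.LazyFrame(
    {
        "key": ["a", "b", "a", "b"],
        "val": [1, 2, 3, 4],
    }
)

out = (
    lf.with_columns((pl.col("val") * 2).alias("val2"))  # add a derived column
    .filter(pl.col("val2") > 2)                         # keep rows where val2 > 2
    .groupby("key")                                     # group on the key column
    .agg(pl.col("val2").sum())                          # sum within each group
    .collect()                                          # execute the lazy query
)
print(out)

Running a type checker such as mypy over a snippet like this, with the generated .pyi files on its stub search path, is one way to confirm that a regenerated stub still describes these methods.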
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/series/series deleted file mode 100644 index 4e9c7f3..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/series/series +++ /dev/null @@ -1,299 +0,0 @@ - -from datetime import date, datetime, time, timedelta -from polars.dataframe.frame import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.expr.expr import Expr as Expr -from polars.internals.construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, NoReturn, Sequence, overload -from 
typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -def wrap_s(s: PySeries) -> Series: ... - -class Series: - _s: PySeries - _accessors: set[str] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex, nan_to_null: bool = ...) -> Self: ... - @classmethod - def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Any) -> Self: ... - def __sub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __floordiv__(self, other): ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, power: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... 
- def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> SeriesIter: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def min(self) -> int | float | date | datetime | timedelta | time | str | None: ... - def max(self) -> int | float | date | datetime | timedelta | time | str | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ...) -> DataFrame: ... - def value_counts(self, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, in_place: bool = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, reverse: bool = ...) -> Series: ... - def cummin(self, reverse: bool = ...) -> Series: ... - def cumprod(self, reverse: bool = ...) -> Series: ... - def cumsum(self, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, descending: bool = ..., *, in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ..., descending: bool = ...) -> Series: ... - def arg_sort(self, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def argsort(self, descending: bool = ..., nulls_last: bool = ...) -> Series: ... 
- def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Sequence[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, start: Expr | datetime | date | time | int | float | str, end: Expr | datetime | date | time | int | float | str, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, fill_value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... 
- def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, periods: int, fill_value: int | Expr) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., descending: bool = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, bias: bool = ...) -> float | None: ... - def kurtosis(self, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, min_val: int | float, max_val: int | float) -> Series: ... - def clip_min(self, min_val: int | float) -> Series: ... - def clip_max(self, max_val: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ...) -> Self: ... - def reshape(self, dims: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... 
- def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - @property - def arr(self) -> ListNameSpace: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -class SeriesIter: - len: int - i: int - s: Series - def __init__(self, length: int, s: Series) -> None: ... - def __iter__(self) -> SeriesIter: ... - def __next__(self) -> Any: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/series/series.pyi new file mode 100644 index 0000000..7e9a92b --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.14/polars/series/series.pyi @@ -0,0 +1,3819 @@ +#: version 0.16.14 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, time, timedelta +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.internals.construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils.convert import _date_to_pl_date 
as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar, NoReturn, Sequence + +TYPE_CHECKING: bool +_PYARROW_AVAILABLE: bool +def wrap_s(s: PySeries) -> Series: ... + +class Series: + _s: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array, rechunk: bool = ...) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, *args, **kwargs) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + @classmethod + def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def le(self, other: Any) -> Self: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self: + """Method equivalent of operator expression ``series == other``.""" + def ne(self, other: Any) -> Self: + """Method equivalent of operator expression ``series != other``.""" + def ge(self, other: Any) -> Self: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame: ... + def __sub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Series: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame: ... + def __mod__(self, other: Any) -> Series: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... 
+ def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, power: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> SeriesIter: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self) -> Any: + ''' + Return the series as a scalar. + + Equivalent to ``s[0]``, with a check that the shape is (1,). + + Examples + -------- + >>> s = pl.Series("a", [1]) + >>> s.item() + 1 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. 
+ """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (6, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ count ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ unique ┆ 4 │ + │ null_count ┆ 1 │ + │ count ┆ 5 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def min(self) -> int | float | date | datetime | timedelta | time | str | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> int | float | date | datetime | timedelta | time | str | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ...) -> DataFrame: + ''' + Bin values into discrete values. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut(bins=[-1, 1]) + shape: (12, 3) + ┌──────┬─────────────┬──────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪══════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ + │ 1.5 ┆ inf ┆ (1.0, inf] │ + │ 2.0 ┆ inf ┆ (1.0, inf] │ + │ 2.5 ┆ inf ┆ (1.0, inf] │ + └──────┴─────────────┴──────────────┘ + + ''' + def value_counts(self, sort: bool = ...) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ..., normalize: bool = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. 
+ + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ..., parallel: bool = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str, in_place: bool = ...) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self, reverse: bool = ...) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self, reverse: bool = ...) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self, reverse: bool = ...) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self, reverse: bool = ...) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series, append_chunks: bool = ...) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. 
+ + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self) -> Series: + """ + Return the `k` largest elements. + + If 'descending=True` the smallest elements will be given. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + descending + Return the smallest elements. + + """ + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def argsort(self, *args, **kwargs) -> Series: + """ + Get the index values that would sort this Series. + + Alias for :func:`Series.arg_sort`. + + .. deprecated:: 0.16.5 + `Series.argsort` will be removed in favour of `Series.arg_sort`. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + """ + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. 
+ side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self, maintain_order: bool = ...) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self, *args, **kwargs) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. 
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Sequence[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + .. deprecated:: 0.15.16 + `Series.explode` will be removed in favour of `Series.arr.explode` and + `Series.str.explode`. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ListNameSpace.explode : Explode a list column. + StringNameSpace.explode : Explode a string column. + + """ + def series_equal(self, other: Series, null_equal: bool = ..., strict: bool = ...) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. 
+ + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self, use_pyarrow: bool = ...) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, start: Expr | datetime | date | time | int | float | str, end: Expr | datetime | date | time | int | float | str, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + start + Lower bound value (can be an expression or literal). + end + Upper bound value (can be an expression or literal). + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. 
+ + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self, ignore_nulls: bool = ...) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use pyarrow for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, fill_value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + fill_value + Value used to fill nan values. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. 
+ + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. 
+ - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, periods: int, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + fill_value + Fill None values with the result of this expression. + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Compute a rolling std dev. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int, bias: bool = ...) -> Series: + ''' + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_skew(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 0.0 + 0.0 + 0.381802 + 0.0 + ] + + ''' + def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `frac`. Defaults to 1 if + `frac` is None. + frac + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self, in_place: bool = ...) -> Series: + """ + Shrink Series memory usage. 
+ + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self, signed: bool = ...) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. 
+ + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self, bias: bool = ...) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self, fisher: bool = ..., bias: bool = ...) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, min_val: int | float, max_val: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + min_val + Minimum value. + max_val + Maximum value. 
+ + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, min_val: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + min_val + Minimum value. + + ''' + def clip_max(self, max_val: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + max_val + Maximum value. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ''' + def reshape(self, dims: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dims + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). 
+ If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def str(self): ... + @property + def struct(self): ... + +class SeriesIter: + def __init__(self, length: int, s: Series) -> None: ... + def __iter__(self) -> SeriesIter: ... + def __next__(self) -> Any: ... 
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/dataframe/frame deleted file mode 100644 index 3bdee90..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/dataframe/frame +++ /dev/null @@ -1,278 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from io import BytesIO, IOBase -from pathlib import Path -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturned as NoRowsReturned, TooManyRowsReturned as TooManyRowsReturned -from polars.expr import Expr as Expr -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.lazyframe import LazyFrame as LazyFrame -from polars.polars import PyDataFrame as PyDataFrame -from polars.series import Series as Series -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils._construction import _post_apply_columns as 
_post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr -from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: set[str] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], infer_schema_length: int | None = ..., schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, columns: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: ... - def _div(self, other: Any, floordiv: bool) -> Self: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... 
- def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int]) -> Series: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, str]) -> Series: ... - @overload - def __getitem__(self, item: tuple[int, int]) -> Any: ... - @overload - def __getitem__(self, item: tuple[int, str]) -> Any: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - @overload - def write_json(self, file: None = ..., pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | str | Path, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, include_header: bool = ..., header_name: str = ..., column_names: Iterator[str] | Sequence[str] | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: ... - @overload - def glimpse(self, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, return_as_string: Literal[True]) -> str: ... - def describe(self) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def frame_equal(self, other: DataFrame, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_col: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def pipe(self, func: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy[Self]: ... - def groupby_rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> RollingGroupBy[Self]: ... 
- def groupby_dynamic(self, index_column: str, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> DynamicGroupBy[Self]: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: DataFrame, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., suffix: str = ...) -> Self: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., inference_size: int = ...) -> Self: ... - def hstack(self, columns: list[Series] | DataFrame, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, fill_value: Expr | int | float | None) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr = ..., maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, periods: int, fill_value: int | str | float) -> Self: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... 
- def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ...) -> Self: ... - def unique(self, maintain_order: bool = ..., subset: str | Sequence[str] | None = ..., keep: UniqueKeepStrategy = ...) -> Self: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, named: Literal[True]) -> list[dict[str, Any]]: ... - @overload - def iter_rows(self, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> Self: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs): ... - def merge_sorted(self, other: DataFrame, key: str) -> Self: ... - def update(self, other: DataFrame, on: None | str | Sequence[str] = ..., how: str = ...) 
-> Self: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/dataframe/frame.pyi new file mode 100644 index 0000000..f6bac34 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/dataframe/frame.pyi @@ -0,0 +1,5290 @@ +#: version 0.16.15 +import P +import np as np +import pa as pa +import pd as pd +from _io import BytesIO + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Object as Object, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturned as NoRowsReturned, TooManyRowsReturned as TooManyRowsReturned +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr +from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +FLOAT_DTYPES: frozenset +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: ClassVar[set] = 
... + columns: list[str] + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], infer_schema_length: int | None = ..., schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, *args, **kwargs) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, *args, **kwargs) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. 
+ * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, *args, **kwargs) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, *args, **kwargs) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. 
+ + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> Self: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> Self: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. 
+ allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> Self: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int] | tuple[MultiRowSelector, str] | tuple[int, int] | tuple[int, str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self) -> Any: + ''' + Return the dataframe as a scalar. + + Equivalent to ``df[0,0]``, with a check that the shape is (1,1). 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> result = df.select((pl.col("a") * pl.col("b")).sum()) + >>> result + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 32 │ + └─────┘ + >>> result.item() + 32 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> numpy_array = df.to_numpy() + >>> type(numpy_array) + + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.DataFrame: + ''' + Cast to a pandas DataFrame. 
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. 
+ + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. 
(This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. Valid names are: + "average", "count_nums", "count", "max", "min", "std_dev", "sum", "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). 
+ has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, see: + https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should be adjacent to each other. + Two other polars-specific keys are available to help define where the sparkline + appears in the table: "insert_after", and "insert_before". The value associated + with these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... 
"bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. 
Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection uri, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional generator/iterator that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self) -> Self: + ''' + Summary statistics for a DataFrame. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (7, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, *args, **kwargs) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + """ + Return the `k` largest elements. + + If 'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the 'k' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + """ + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_col: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_col + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. 
If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, func: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + func + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... 
return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy[Self]: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. 
+ + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: str) -> RollingGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + *groupby_dynamic* + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime) + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: str) -> DynamicGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + - \'window\': Truncate the start of the window with the \'every\' argument. + - \'datapoint\': Start from the first encountered data point. + - \'monday\': Start the window on the monday before the first data point. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... low=datetime(2021, 12, 16), + ... high=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "n": range(7), + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... 
) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... low=datetime(2021, 12, 16), + ... high=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. 
This is slower. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ) + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... 
], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ) + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ) + >>> population.join_asof( + ... gdp, left_on="date", right_on="date", strategy="backward" + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame) -> Self: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. 
+ + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select([pl.col("foo") * 2, pl.col("bar") * 3]) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... 
) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current DataFrame, with zero to \'n\' rows. + + Returns a DataFrame with identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, fill_value: Expr | int | float | None) -> Self: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + fill_value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function : {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + A predefined aggregate function str or an expression. + maintain_order + Sort the grouped keys so that the output order is predictable. 
+ sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar") + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... 
) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ … ┆ … │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, *args, **kwargs) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. 
+ + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, periods: int, fill_value: int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + fill_value + fill None values with this value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. 
+ + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. 
The columns + will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... 
+ shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + Degrees of freedom + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + Degrees of freedom + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"]) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last") + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... 
) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `frac`. Defaults to 1 if + `frac` is None. + frac + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. 
+ + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturned``, and + zero rows will raise ``NoRowsReturned`` (both inherit from ``RowsException``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you absolutely + require row-iteration you should strongly prefer ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialises all frame data as a list of rows. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.rows() + [(1, 2), (3, 4), (5, 6)] + >>> df.rows(named=True) + [{\'a\': 1, \'b\': 2}, {\'a\': 3, \'b\': 4}, {\'a\': 5, \'b\': 6}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. 
+ buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + An iterator of tuples (default) or dictionaries of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, *args, **kwargs) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> Self: + ''' + Return Pearson product-moment correlation coefficients. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. 
+ + Parameters + ---------- + kwargs + keyword arguments are passed to numpy corrcoef + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def update(self, other: DataFrame, on: None | str | Sequence[str] = ..., how: str = ...) -> Self: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'Left\' will keep the left table rows as is. + \'Inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
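The ``update`` docstring in the DataFrame stub above describes the method as syntactic sugar for a left/inner join plus a coalesce. As a minimal sketch of that equivalence (not part of the generated stub; it assumes the default join-on-row-count behaviour, and the helper names ``__idx`` and ``_new`` are illustrative choices rather than anything polars defines), the docstring's own example can be reproduced with an explicit join:

>>> import polars as pl
>>> df = pl.DataFrame({"A": [1, 2, 3, 4], "B": [400, 500, 600, 700]})
>>> new_df = pl.DataFrame({"B": [4, None, 6], "C": [7, 8, 9]})
>>> updated = (
...     df.with_row_count("__idx")  # join on the row count, as update() does when `on` is None
...     .join(new_df.with_row_count("__idx"), on="__idx", how="left", suffix="_new")
...     .with_columns(pl.coalesce([pl.col("B_new"), pl.col("B")]).alias("B"))  # prefer the non-null new value
...     .select(["A", "B"])
... )

Under these assumptions ``updated`` matches ``df.update(new_df)`` from the example: rows 0 and 2 of ``B`` take the new values 4 and 6, while the null in ``new_df`` leaves 500 unchanged and the extra column ``C`` is dropped.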
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/expr/expr deleted file mode 100644 index 58d2f89..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/expr/expr +++ /dev/null @@ -1,248 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import date, datetime, time, timedelta -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Struct as Struct, UInt32 as UInt32, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.lazyframe import LazyFrame as LazyFrame -from polars.polars import PyExpr as PyExpr -from polars.series import Series as Series -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: set[str] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... 
- def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def xor(self, other: Any) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: Expr, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, reverse: bool = ...) -> Self: ... - def cumprod(self, reverse: bool = ...) -> Self: ... - def cummin(self, reverse: bool = ...) -> Self: ... - def cummax(self, reverse: bool = ...) -> Self: ... - def cumcount(self, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], strict: bool = ...) -> Self: ... - def sort(self, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ..., descending: bool = ...) -> Self: ... - def arg_sort(self, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... 
- def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, periods: int, fill_value: int | float | bool | str | Expr | list[Any]) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, fill_value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., skip_nulls: bool = ..., pass_name: bool = ..., *, strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: Expr | str) -> Self: ... - def is_between(self, start: Expr | datetime | date | time | int | float | str, end: Expr | datetime | date | time | int | float | str, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... 
- def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def argsort(self, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def rank(self, method: RankMethod = ..., descending: bool = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, bias: bool = ...) -> Self: ... - def kurtosis(self, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, min_val: int | float, max_val: int | float) -> Self: ... - def clip_min(self, min_val: int | float) -> Self: ... - def clip_max(self, max_val: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def reshape(self, dims: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... 
- def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def entropy(self, base: float = ..., normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., parallel: bool = ...) -> Self: ... - def set_sorted(self, descending: bool = ...) -> Self: ... - def list(self) -> Self: ... - def shrink_dtype(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ...) -> Self: ... - @property - def arr(self) -> ExprListNameSpace: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/expr/expr.pyi new file mode 100644 index 0000000..500a321 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/expr/expr.pyi @@ -0,0 +1,5482 @@ +#: version 0.16.15 +import P +import np as np +from datetime import timedelta +from polars.datatypes.classes import Struct as Struct, UInt32 as UInt32 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method + +class Expr: + _pyexpr: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... 
+ def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def and_(self, *others: Any) -> Self: + """Method equivalent of operator expression ``expr & other1 & other2 & ...``.""" + def or_(self, *others: Any) -> Self: + """Method equivalent of operator expression ``expr | other1 | other2 | ...``.""" + def eq(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr == other``.""" + def ge(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr >= other``.""" + def gt(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr > other``.""" + def le(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr <= other``.""" + def lt(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr < other``.""" + def ne(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr != other``.""" + def add(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr + other``.""" + def floordiv(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr // other``.""" + def mod(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr % other``.""" + def mul(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr * other``.""" + def sub(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr - other``.""" + def truediv(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr / other``.""" + def xor(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr ^ other``.""" + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + pl.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... [ + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ] + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. 
+ + >>> df.select([(pl.lit(10) / pl.all()).keep_name()]) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr): + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + ... + >>> df = pl.DataFrame({"a": ["a: 1", "b: 2", "c: 3"]}) + >>> df.with_columns(pl.col("a").pipe(extract_number)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, *args, **kwargs) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + >>> df.select( + ... pl.all().reverse().map_alias(lambda colName: colName + "_reverse") + ... ) + shape: (2, 2) + ┌───────────┬───────────┐ + │ A_reverse ┆ B_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════════╪═══════════╡ + │ 2 ┆ 4 │ + │ 1 ┆ 3 │ + └───────────┴───────────┘ + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... 
{ + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: Expr, upcast: bool = ...) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └─────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... 
) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... 
) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any], strict: bool = ...) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... 
) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, *args, **kwargs) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + descending + Return the smallest elements. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").top_k(descending=True).alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) 
-> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, *args, **kwargs) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... 
) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, periods: int, fill_value: int | float | bool | str | Expr | list[Any]) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + fill_value + Fill None values with the result of this expression. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill(1, "a")) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.fill_null(strategy="zero") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ 0 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(99) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ 99 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(strategy="forward") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def fill_nan(self, fill_value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... 
) + >>> df.fill_nan("zero") + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪══════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ zero │ + │ zero ┆ 6.0 │ + └──────┴──────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 6 │ + │ null ┆ 6 │ + └──────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + Degrees of freedom. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + Degrees of freedom. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self, maintain_order: bool = ...) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... 
).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, polars will assume that + the dtype remains unchanged. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considdered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + .. deprecated:: 0.15.16 + `Expr.explode` will be removed in favour of `Expr.arr.explode` and + `Expr.str.explode`. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + ExprStringNameSpace.explode : Explode a string column. + + """ + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`Expr.head`. + + Parameters + ---------- + n + Number of rows to return. + + """ + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Raise expression to the power of exponent. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").pow(3)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ 8.0 │ + │ 27.0 │ + │ 64.0 │ + └──────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: Expr | str) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, start: Expr | datetime | date | time | int | float | str, end: Expr | datetime | date | time | int | float | str, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + start + Lower bound value (can be an expression or literal). + end + Upper bound value (can be an expression or literal). + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... 
) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self, signed: bool = ...) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'linear\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... 
).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_min(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + └──────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_max(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 8.0, 6.0, 2.0, 16.0, 10.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_mean(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.5 │ + │ 7.0 │ + │ 4.0 │ + │ 9.0 │ + │ 13.0 │ + └──────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_sum(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 3.0 │ + │ 5.0 │ + │ 7.0 │ + │ 9.0 │ + │ 11.0 │ + └──────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Compute a rolling standard deviation. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_std(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 1.527525 │ + │ 2.0 │ + └──────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_var(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 2.333333 │ + │ 4.0 │ + └──────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_median(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_quantile(quantile=0.33, window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + └──────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int, bias: bool = ...) -> Self: + """ + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + """ + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def argsort(self, *args, **kwargs) -> Self: + ''' + Get the index values that would sort this column. + + Alias for :func:`Expr.arg_sort`. + + .. deprecated:: 0.16.5 + `Expr.argsort` will be removed in favour of `Expr.arg_sort`. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").argsort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def rank(self, *args, **kwargs) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. 
+ + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self, bias: bool = ...) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self, fisher: bool = ..., bias: bool = ...) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. 
+ If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, min_val: int | float, max_val: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + min_val + Minimum value. + max_val + Maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, min_val: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + min_val + Minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, max_val: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + max_val + Maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def reshape(self, dims: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dims + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `frac`. Defaults to 1 if + `frac` is None. + frac + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(frac=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. 
+ + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self, multithreaded: bool = ..., sort: bool = ...) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def entropy(self, base: float = ..., normalize: bool = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ..., parallel: bool = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self, *args, **kwargs) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def list(self) -> Self: + ''' + Aggregate to list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().list()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... 
).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + ''' + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/lazyframe/frame deleted file mode 100644 index 5f1b596..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/lazyframe/frame +++ /dev/null @@ -1,127 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.expr.expr import Expr as Expr -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.series import Series as Series -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, PolarsExprType as PolarsExprType, 
PythonLiteral as PythonLiteral, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath -from typing import Any, Callable, Concatenate, Iterable, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: set[str] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - @classmethod - def _scan_csv(cls, source: str, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... 
- def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, func: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_plan(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_optimized_plan(self, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, optimized: bool = ..., *, show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... 
- def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy[Self]: ... - def groupby_rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> LazyGroupBy[Self]: ... - def groupby_dynamic(self, index_column: str, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> LazyGroupBy[Self]: ... - def join_asof(self, other: LazyFrame, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., suffix: str = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, exprs: str | PolarsExprType | PythonLiteral | Series | Iterable[str | PolarsExprType | PythonLiteral | Series | None] | None = ..., *more_exprs: str | PolarsExprType | PythonLiteral | Series | None, **named_exprs: str | PolarsExprType | PythonLiteral | Series | None) -> Self: ... - def with_context(self, other): ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, periods: int, fill_value: Expr | int | str | float) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, fill_value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... 
- def median(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, maintain_order: bool = ..., subset: str | Sequence[str] | None = ..., keep: UniqueKeepStrategy = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def update(self, other: LazyFrame, on: None | str | Sequence[str] = ..., how: str = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..453ce9c --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/lazyframe/frame.pyi @@ -0,0 +1,3128 @@ +#: version 0.16.15 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... 
+ @classmethod + def _scan_csv(cls, source: str, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> Self: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. 
+ + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, func: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + func + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def describe_plan(self) -> str: + ''' + Create a string representation of the unoptimized query plan. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. 
+ common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + .. deprecated:: 0.16.10 + Use ``LazyFrame.explain`` + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).describe_plan() # doctest: +SKIP + + ''' + def describe_optimized_plan(self) -> str: + """Create a string representation of the optimized query plan.""" + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, *args, **kwargs) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + """ + Return the `k` largest elements. + + If 'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the 'k' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + """ + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. 
+ + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. 
+ type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... 
) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy[Self]: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: str) -> LazyGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. 
+ + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime) + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: str) -> LazyGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. 
+ + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + * \'datapoint\': Start from the first encountered data point. + * \'monday\': Start the window on the monday before the first data point. + + See Also + -------- + groupby_rolling + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... low=datetime(2021, 12, 16), + ... high=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... 
).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... low=datetime(2021, 12, 16), + ... high=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. 
+ tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ) + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ) + >>> population.join_asof( + ... gdp, left_on="date", right_on="date", strategy="backward" + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, exprs: str | PolarsExprType | PythonLiteral | Series | Iterable[str | PolarsExprType | PythonLiteral | Series | None] | None = ..., *more_exprs: str | PolarsExprType | PythonLiteral | Series | None, **named_exprs: str | PolarsExprType | PythonLiteral | Series | None) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... 
(pl.col("c").is_not()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... 
).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, periods: int, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + fill_value + fill None values with the result of this expression. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift_and_fill(periods=1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, fill_value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + fill_value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their standard deviation value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their variance value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. 
+ + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique().collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"]).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last").collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. 
Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible ot running in the streaming + engine. That means that the function must produce the same result if it + is exectuted on batches as it would when executed on the full dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, *args, **kwargs) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... 
"before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def update(self, other: LazyFrame, on: None | str | Sequence[str] = ..., how: str = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'Left\' will keep the left table rows as is. + \'Inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/series/series deleted file mode 100644 index 475dde0..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/series/series +++ /dev/null @@ -1,300 +0,0 @@ - -from datetime import date, datetime, time, timedelta -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.expr.expr import Expr as Expr -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, 
Callable, Collection, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: set[str] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex, nan_to_null: bool = ...) -> Self: ... - @classmethod - def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Any) -> Self: ... - def __sub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __floordiv__(self, other): ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, power: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... 
- def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> SeriesIter: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def qcut(self, quantiles: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def hist(self, bins: list[float] | None = ..., bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, in_place: bool = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, reverse: bool = ...) -> Series: ... - def cummin(self, reverse: bool = ...) -> Series: ... - def cumprod(self, reverse: bool = ...) -> Series: ... - def cumsum(self, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, descending: bool = ..., *, in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ..., descending: bool = ...) 
-> Series: ... - def arg_sort(self, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def argsort(self, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, start: Expr | datetime | date | time | int | float | str, end: Expr | datetime | date | time | int | float | str, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, fill_value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... 
- def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, periods: int, fill_value: int | Expr) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., descending: bool = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, bias: bool = ...) -> float | None: ... - def kurtosis(self, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, min_val: int | float, max_val: int | float) -> Series: ... - def clip_min(self, min_val: int | float) -> Series: ... - def clip_max(self, max_val: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ...) -> Self: ... - def reshape(self, dims: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... 
- def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - @property - def arr(self) -> ListNameSpace: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -class SeriesIter: - len: int - i: int - s: Series - def __init__(self, length: int, s: Series) -> None: ... - def __iter__(self) -> SeriesIter: ... - def __next__(self) -> Any: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/series/series.pyi new file mode 100644 index 0000000..003877b --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.15/polars/series/series.pyi @@ -0,0 +1,3905 @@ +#: version 0.16.15 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, time, timedelta +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, 
numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar, Collection, NoReturn, Sequence + +TYPE_CHECKING: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array, rechunk: bool = ...) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, *args, **kwargs) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + @classmethod + def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def le(self, other: Any) -> Self: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self: + """Method equivalent of operator expression ``series == other``.""" + def ne(self, other: Any) -> Self: + """Method equivalent of operator expression ``series != other``.""" + def ge(self, other: Any) -> Self: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame: ... + def __sub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Series: ... 
+ def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame: ... + def __mod__(self, other: Any) -> Series: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, power: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> SeriesIter: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self) -> Any: + ''' + Return the series as a scalar. + + Equivalent to ``s[0]``, with a check that the shape is (1,). + + Examples + -------- + >>> s = pl.Series("a", [1]) + >>> s.item() + 1 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) 
-> Series: + """Compute the logarithm to a given base.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (6, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ count ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ unique ┆ 4 │ + │ null_count ┆ 1 │ + │ count ┆ 5 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: + ''' + Bin values into discrete values. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut(bins=[-1, 1]) + shape: (12, 3) + ┌──────┬─────────────┬──────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪══════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ + │ 1.5 ┆ inf ┆ (1.0, inf] │ + │ 2.0 ┆ inf ┆ (1.0, inf] │ + │ 2.5 ┆ inf ┆ (1.0, inf] │ + └──────┴─────────────┴──────────────┘ + + ''' + def qcut(self, quantiles: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: + ''' + Bin values into discrete values based on their quantiles. + + Parameters + ---------- + quantiles + Quaniles to create. + We expect quantiles ``0.0 <= quantile <= 1`` + labels + Labels to assign to the quantiles. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. 
+ + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75]) + shape: (8, 3) + ┌──────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪═══════════════╡ + │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ + │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ + │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1.0 ┆ inf ┆ (0.25, inf] │ + │ 2.0 ┆ inf ┆ (0.25, inf] │ + └──────┴─────────────┴───────────────┘ + + ''' + def hist(self, bins: list[float] | None = ..., bin_count: int | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self, sort: bool = ...) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ..., normalize: bool = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ..., parallel: bool = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. 
+ + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str, in_place: bool = ...) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self, reverse: bool = ...) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self, reverse: bool = ...) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self, reverse: bool = ...) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self, reverse: bool = ...) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. 
+ + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series, append_chunks: bool = ...) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. 
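+
+ A minimal illustration (arbitrary values); the result is identical to
+ ``Series.head``:
+
+ >>> s = pl.Series("a", [1, 2, 3, 4, 5])
+ >>> s.limit(3)
+ shape: (3,)
+ Series: 'a' [i64]
+ [
+ 1
+ 2
+ 3
+ ]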
+ + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self) -> Series: + """ + Return the `k` largest elements. + + If 'descending=True` the smallest elements will be given. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + descending + Return the smallest elements. + + """ + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def argsort(self, *args, **kwargs) -> Series: + """ + Get the index values that would sort this Series. + + Alias for :func:`Series.arg_sort`. + + .. deprecated:: 0.16.5 + `Series.argsort` will be removed in favour of `Series.arg_sort`. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + """ + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self, maintain_order: bool = ...) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self, *args, **kwargs) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. 
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + .. deprecated:: 0.15.16 + `Series.explode` will be removed in favour of `Series.arr.explode` and + `Series.str.explode`. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ListNameSpace.explode : Explode a list column. + StringNameSpace.explode : Explode a string column. + + """ + def series_equal(self, other: Series, null_equal: bool = ..., strict: bool = ...) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self, use_pyarrow: bool = ...) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, start: Expr | datetime | date | time | int | float | str, end: Expr | datetime | date | time | int | float | str, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + start + Lower bound value (can be an expression or literal). + end + Upper bound value (can be an expression or literal). + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. 
+ + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self, ignore_nulls: bool = ...) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use pyarrow for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. 
+ + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, fill_value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + fill_value + Value used to fill nan values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). 
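+
+ For comparison (illustration only), the native-expression equivalent of the
+ example below, which avoids the Python UDF entirely:
+
+ >>> pl.Series("a", [1, 2, 3]).to_frame().select(pl.col("a") + 10).to_series()
+ shape: (3,)
+ Series: \'a\' [i64]
+ [
+ 11
+ 12
+ 13
+ ]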
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, periods: int, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + fill_value + Fill None values with the result of this expression. + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Compute a rolling variance. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int, bias: bool = ...) -> Series: + ''' + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_skew(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 0.0 + 0.0 + 0.381802 + 0.0 + ] + + ''' + def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `frac`. Defaults to 1 if + `frac` is None. + frac + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self, in_place: bool = ...) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self, signed: bool = ...) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. 
For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. 
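+
+ Equivalently (illustration, assuming no intermediate nulls): each output is
+ ``x[i] / x[i - n] - 1``, so ``pl.Series([1, 2, 4]).pct_change()`` gives
+ ``[null, 1.0, 1.0]``.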
+ + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self, bias: bool = ...) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self, fisher: bool = ..., bias: bool = ...) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, min_val: int | float, max_val: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + min_val + Minimum value. + max_val + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, min_val: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + min_val + Minimum value. + + ''' + def clip_max(self, max_val: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. 
+ + Parameters + ---------- + max_val + Maximum value. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ''' + def reshape(self, dims: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dims + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def str(self): ... + @property + def struct(self): ... + +class SeriesIter: + def __init__(self, length: int, s: Series) -> None: ... + def __iter__(self) -> SeriesIter: ... + def __next__(self) -> Any: ... 
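# Editorial sketch, not part of the generated stub: the ewm_mean / ewm_std / ewm_var
# docstrings above define the adjust=False recursion y_0 = x_0,
# y_t = (1 - alpha) * y_{t-1} + alpha * x_t. A minimal pure-Python rendering of that
# formula, assuming Series.ewm_mean(alpha=..., adjust=False) behaves as documented above.
import polars as pl

def ewm_mean_recursive(values: list[float], alpha: float) -> list[float]:
    """Apply y_t = (1 - alpha) * y_{t-1} + alpha * x_t, starting from y_0 = x_0."""
    out: list[float] = []
    for x in values:
        out.append(x if not out else (1 - alpha) * out[-1] + alpha * x)
    return out

data = [1.0, 2.0, 3.0]
alpha = 0.5  # com=1 gives alpha = 1 / (1 + com) = 0.5, per the docstring above
print(ewm_mean_recursive(data, alpha))  # [1.0, 1.5, 2.25]
print(pl.Series("a", data).ewm_mean(alpha=alpha, adjust=False).to_list())  # expected to match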
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/dataframe/frame deleted file mode 100644 index 3bdee90..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/dataframe/frame +++ /dev/null @@ -1,278 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from io import BytesIO, IOBase -from pathlib import Path -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturned as NoRowsReturned, TooManyRowsReturned as TooManyRowsReturned -from polars.expr import Expr as Expr -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.lazyframe import LazyFrame as LazyFrame -from polars.polars import PyDataFrame as PyDataFrame -from polars.series import Series as Series -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils._construction import _post_apply_columns as 
_post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr -from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: set[str] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], infer_schema_length: int | None = ..., schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
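# Editorial sketch, not part of the diff: the constructor and _from_dict / _from_records
# signatures above accept `schema` and `schema_overrides`. A small illustration of
# overriding a single inferred dtype via the public constructor, assuming it forwards
# these options as the signatures suggest.
import polars as pl

df = pl.DataFrame(
    {"id": [1, 2, 3], "score": [0.5, 0.75, 1.0]},
    schema_overrides={"id": pl.UInt32},  # force "id" to UInt32, keep inference for "score"
)
print(df.dtypes)  # [UInt32, Float64]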
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, columns: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: ... - def _div(self, other: Any, floordiv: bool) -> Self: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... 
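# Editorial note, not part of the diff: __bool__ above is annotated as NoReturn, i.e.
# truth-testing a DataFrame never yields a value. A minimal sketch of what that means for
# callers; the exact exception raised at runtime is an assumption (a TypeError in the
# polars versions the editor has used), so it is only hedged here.
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3]})
try:
    if df:  # type checkers can flag this branch, since bool(df) is typed NoReturn
        ...
except TypeError as exc:
    # Prefer explicit checks such as df.is_empty() or df.height > 0.
    print(f"ambiguous truth value: {exc}")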
- def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int]) -> Series: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, str]) -> Series: ... - @overload - def __getitem__(self, item: tuple[int, int]) -> Any: ... - @overload - def __getitem__(self, item: tuple[int, str]) -> Any: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - @overload - def write_json(self, file: None = ..., pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | str | Path, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
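# Editorial sketch, not part of the diff: the paired @overload stubs above for
# write_csv / write_json / write_ndjson encode "no file target -> the serialized text is
# returned, file target -> None". Under that reading a type checker narrows the result:
import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})

csv_text = df.write_csv()         # matches the `file: None` overload, typed as str
print(csv_text.splitlines()[0])   # header row: foo,bar

result = df.write_csv("out.csv")  # matches the path overload, typed as None
assert result is None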
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, include_header: bool = ..., header_name: str = ..., column_names: Iterator[str] | Sequence[str] | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: ... - @overload - def glimpse(self, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, return_as_string: Literal[True]) -> str: ... - def describe(self) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def frame_equal(self, other: DataFrame, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_col: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def pipe(self, func: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy[Self]: ... - def groupby_rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> RollingGroupBy[Self]: ... 
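# Editorial sketch, not part of the diff: pipe above is typed with
# Callable[Concatenate[DataFrame, P], T] plus *args: P.args / **kwargs: P.kwargs, so the
# extra arguments forwarded to the piped function are checked against its signature.
# `add_constant` is a hypothetical helper, not from polars or this repository.
import polars as pl

def add_constant(df: pl.DataFrame, column: str, value: int) -> pl.DataFrame:
    return df.with_columns((pl.col(column) + value).alias(f"{column}_plus_{value}"))

df = pl.DataFrame({"a": [1, 2, 3]})
out = df.pipe(add_constant, "a", 10)   # "a" and 10 are checked against add_constant
# df.pipe(add_constant, "a", "oops")   # a type checker would reject this under the stub
print(out.columns)  # ['a', 'a_plus_10']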
- def groupby_dynamic(self, index_column: str, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> DynamicGroupBy[Self]: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: DataFrame, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., suffix: str = ...) -> Self: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., inference_size: int = ...) -> Self: ... - def hstack(self, columns: list[Series] | DataFrame, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, fill_value: Expr | int | float | None) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr = ..., maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, periods: int, fill_value: int | str | float) -> Self: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... 
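# Editorial sketch, not part of the diff: the partition_by overloads above use
# Literal[False] / Literal[True] on `as_dict`, narrowing the return type to
# list[DataFrame] or dict[key, DataFrame]. An illustration, assuming the runtime
# behaviour matches those annotations (the exact key type has varied across versions).
import polars as pl

df = pl.DataFrame({"grp": ["a", "a", "b"], "val": [1, 2, 3]})

parts_list = df.partition_by("grp")                 # as_dict defaults to False -> list
print(len(parts_list))                              # 2 groups

parts_dict = df.partition_by("grp", as_dict=True)   # -> dict keyed by group value
print(len(parts_dict))                              # 2 entries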
- def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ...) -> Self: ... - def unique(self, maintain_order: bool = ..., subset: str | Sequence[str] | None = ..., keep: UniqueKeepStrategy = ...) -> Self: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, named: Literal[True]) -> list[dict[str, Any]]: ... - @overload - def iter_rows(self, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> Self: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs): ... - def merge_sorted(self, other: DataFrame, key: str) -> Self: ... - def update(self, other: DataFrame, on: None | str | Sequence[str] = ..., how: str = ...) 
-> Self: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/dataframe/frame.pyi new file mode 100644 index 0000000..f33fb4d --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/dataframe/frame.pyi @@ -0,0 +1,5290 @@ +#: version 0.16.16 +import P +import np as np +import pa as pa +import pd as pd +from _io import BytesIO + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Object as Object, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturned as NoRowsReturned, TooManyRowsReturned as TooManyRowsReturned +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr +from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +FLOAT_DTYPES: frozenset +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: ClassVar[set] = 
... + columns: list[str] + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], infer_schema_length: int | None = ..., schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, *args, **kwargs) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, *args, **kwargs) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. 
+ * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, *args, **kwargs) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, *args, **kwargs) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. 
+ + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> Self: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> Self: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. 
+ allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> Self: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int] | tuple[MultiRowSelector, str] | tuple[int, int] | tuple[int, str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self) -> Any: + ''' + Return the dataframe as a scalar. + + Equivalent to ``df[0,0]``, with a check that the shape is (1,1). 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> result = df.select((pl.col("a") * pl.col("b")).sum()) + >>> result + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 32 │ + └─────┘ + >>> result.item() + 32 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> numpy_array = df.to_numpy() + >>> type(numpy_array) + + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.DataFrame: + ''' + Cast to a pandas DataFrame. 
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. 
+ + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. 
(This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. Valid names are: + "average", "count_nums", "count", "max", "min", "std_dev", "sum", "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). 
+ has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, see: + https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should be adjacent to each other. + Two other polars-specific keys are available to help define where the sparkline + appears in the table: "insert_after", and "insert_before". The value associated + with these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... 
"bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. 
Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection uri, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional generator/iterator that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self) -> Self: + ''' + Summary statistics for a DataFrame. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (7, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, *args, **kwargs) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + """ + Return the `k` largest elements. + + If 'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the 'k' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + """ + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_col: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_col + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. 
If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, func: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + func + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... 
return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy[Self]: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. 
+ + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: str) -> RollingGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + *groupby_dynamic* + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime) + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: str) -> DynamicGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + - \'window\': Truncate the start of the window with the \'every\' argument. + - \'datapoint\': Start from the first encountered data point. + - \'monday\': Start the window on the monday before the first data point. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... low=datetime(2021, 12, 16), + ... high=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "n": range(7), + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... 
) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... low=datetime(2021, 12, 16), + ... high=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. 
This is slower. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ) + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... 
], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ) + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ) + >>> population.join_asof( + ... gdp, left_on="date", right_on="date", strategy="backward" + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame) -> Self: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. 
+ + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select([pl.col("foo") * 2, pl.col("bar") * 3]) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... 
) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current DataFrame, with zero to \'n\' rows. + + Returns a DataFrame with identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, fill_value: Expr | int | float | None) -> Self: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + fill_value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function : {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + A predefined aggregate function str or an expression. + maintain_order + Sort the grouped keys so that the output order is predictable. 
+ sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar") + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... 
) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ … ┆ … │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, *args, **kwargs) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. 
+ + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, periods: int, fill_value: int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + fill_value + fill None values with this value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. 
+ + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. 
The columns + will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... 
+ shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + Degrees of freedom + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + Degrees of freedom + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"]) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last") + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... 
) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `frac`. Defaults to 1 if + `frac` is None. + frac + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. 
+ + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturned``, and + zero rows will raise ``NoRowsReturned`` (both inherit from ``RowsException``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you absolutely + require row-iteration you should strongly prefer ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialises all frame data as a list of rows. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.rows() + [(1, 2), (3, 4), (5, 6)] + >>> df.rows(named=True) + [{\'a\': 1, \'b\': 2}, {\'a\': 3, \'b\': 4}, {\'a\': 5, \'b\': 6}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. 
+ buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + An iterator of tuples (default) or dictionaries of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, *args, **kwargs) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> Self: + ''' + Return Pearson product-moment correlation coefficients. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. 
+ + Parameters + ---------- + kwargs + keyword arguments are passed to numpy corrcoef + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def update(self, other: DataFrame, on: None | str | Sequence[str] = ..., how: str = ...) -> Self: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'Left\' will keep the left table rows as is. + \'Inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/expr/expr deleted file mode 100644 index 58d2f89..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/expr/expr +++ /dev/null @@ -1,248 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import date, datetime, time, timedelta -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Struct as Struct, UInt32 as UInt32, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.lazyframe import LazyFrame as LazyFrame -from polars.polars import PyExpr as PyExpr -from polars.series import Series as Series -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: set[str] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... 
- def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def xor(self, other: Any) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: Expr, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, reverse: bool = ...) -> Self: ... - def cumprod(self, reverse: bool = ...) -> Self: ... - def cummin(self, reverse: bool = ...) -> Self: ... - def cummax(self, reverse: bool = ...) -> Self: ... - def cumcount(self, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], strict: bool = ...) -> Self: ... - def sort(self, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ..., descending: bool = ...) -> Self: ... - def arg_sort(self, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... 
- def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, periods: int, fill_value: int | float | bool | str | Expr | list[Any]) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, fill_value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., skip_nulls: bool = ..., pass_name: bool = ..., *, strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: Expr | str) -> Self: ... - def is_between(self, start: Expr | datetime | date | time | int | float | str, end: Expr | datetime | date | time | int | float | str, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... 
- def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def argsort(self, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def rank(self, method: RankMethod = ..., descending: bool = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, bias: bool = ...) -> Self: ... - def kurtosis(self, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, min_val: int | float, max_val: int | float) -> Self: ... - def clip_min(self, min_val: int | float) -> Self: ... - def clip_max(self, max_val: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def reshape(self, dims: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... 
- def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def entropy(self, base: float = ..., normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., parallel: bool = ...) -> Self: ... - def set_sorted(self, descending: bool = ...) -> Self: ... - def list(self) -> Self: ... - def shrink_dtype(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ...) -> Self: ... - @property - def arr(self) -> ExprListNameSpace: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/expr/expr.pyi new file mode 100644 index 0000000..a7026ea --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/expr/expr.pyi @@ -0,0 +1,5482 @@ +#: version 0.16.16 +import P +import np as np +from datetime import timedelta +from polars.datatypes.classes import Struct as Struct, UInt32 as UInt32 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method + +class Expr: + _pyexpr: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... 
+ def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def and_(self, *others: Any) -> Self: + """Method equivalent of operator expression ``expr & other1 & other2 & ...``.""" + def or_(self, *others: Any) -> Self: + """Method equivalent of operator expression ``expr | other1 | other2 | ...``.""" + def eq(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr == other``.""" + def ge(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr >= other``.""" + def gt(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr > other``.""" + def le(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr <= other``.""" + def lt(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr < other``.""" + def ne(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr != other``.""" + def add(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr + other``.""" + def floordiv(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr // other``.""" + def mod(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr % other``.""" + def mul(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr * other``.""" + def sub(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr - other``.""" + def truediv(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr / other``.""" + def xor(self, other: Any) -> Self: + """Method equivalent of operator expression ``expr ^ other``.""" + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + pl.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... [ + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ] + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. 
+ + >>> df.select([(pl.lit(10) / pl.all()).keep_name()]) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr): + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + ... + >>> df = pl.DataFrame({"a": ["a: 1", "b: 2", "c: 3"]}) + >>> df.with_columns(pl.col("a").pipe(extract_number)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, *args, **kwargs) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + >>> df.select( + ... pl.all().reverse().map_alias(lambda colName: colName + "_reverse") + ... ) + shape: (2, 2) + ┌───────────┬───────────┐ + │ A_reverse ┆ B_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════════╪═══════════╡ + │ 2 ┆ 4 │ + │ 1 ┆ 3 │ + └───────────┴───────────┘ + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... 
{ + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: Expr, upcast: bool = ...) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └─────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... 
) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... 
) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any], strict: bool = ...) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... 
) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, *args, **kwargs) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + descending + Return the smallest elements. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").top_k(descending=True).alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) 
-> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, *args, **kwargs) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... 
) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, periods: int, fill_value: int | float | bool | str | Expr | list[Any]) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + fill_value + Fill None values with the result of this expression. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill(1, "a")) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.fill_null(strategy="zero") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ 0 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(99) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ 99 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(strategy="forward") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def fill_nan(self, fill_value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... 
) + >>> df.fill_nan("zero") + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪══════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ zero │ + │ zero ┆ 6.0 │ + └──────┴──────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 6 │ + │ null ┆ 6 │ + └──────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + Degrees of freedom. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + Degrees of freedom. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. 
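+
+ NaN values are not propagated by ``min`` (as the example below also suggests);
+ a minimal sketch of the contrast with ``nan_min``, using made-up data:
+
+ >>> df = pl.DataFrame({"a": [-1.0, float("nan"), 1.0]})
+ >>> df.select(
+ ...     [
+ ...         pl.col("a").min().alias("min"),
+ ...         pl.col("a").nan_min().alias("nan_min"),
+ ...     ]
+ ... )  # doctest: +SKIP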
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self, maintain_order: bool = ...) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. 
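+
+ A small usage sketch (made-up data): combine this mask with ``filter`` to
+ keep only the rows whose value occurs exactly once:
+
+ >>> df = pl.DataFrame({"a": [1, 1, 2]})
+ >>> df.filter(pl.col("a").is_unique())  # doctest: +SKIP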
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... 
).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, polars will assume that + the dtype remains unchanged. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + .. deprecated:: 0.15.16 + `Expr.explode` will be removed in favour of `Expr.arr.explode` and + `Expr.str.explode`. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + ExprStringNameSpace.explode : Explode a string column. + + """ + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`Expr.head`. + + Parameters + ---------- + n + Number of rows to return. + + """ + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Raise expression to the power of exponent. 
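+
+ The exponent may itself be an expression or Series, per the signature above;
+ a minimal sketch assuming a second numeric column ``"bar"``:
+
+ >>> df = pl.DataFrame({"foo": [1, 2, 3, 4], "bar": [2, 2, 3, 3]})
+ >>> df.select(pl.col("foo").pow(pl.col("bar")))  # doctest: +SKIP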
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").pow(3)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ 8.0 │ + │ 27.0 │ + │ 64.0 │ + └──────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: Expr | str) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, start: Expr | datetime | date | time | int | float | str, end: Expr | datetime | date | time | int | float | str, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + start + Lower bound value (can be an expression or literal). + end + Upper bound value (can be an expression or literal). + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... 
) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self, signed: bool = ...) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'linear\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... 
).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_min(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + └──────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_max(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
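+
+ A rough sketch of a temporal window (illustrative only; assumes a ``"dt"``
+ column of dtype Date, sorted in ascending order):
+
+ >>> from datetime import date
+ >>> df = pl.DataFrame(
+ ...     {
+ ...         "dt": [date(2023, 1, d) for d in range(1, 7)],
+ ...         "A": [1.0, 8.0, 6.0, 2.0, 16.0, 10.0],
+ ...     }
+ ... )
+ >>> df.select(
+ ...     pl.col("A").rolling_mean(window_size="2d", by="dt", closed="right")
+ ... )  # doctest: +SKIP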
+ + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 8.0, 6.0, 2.0, 16.0, 10.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_mean(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.5 │ + │ 7.0 │ + │ 4.0 │ + │ 9.0 │ + │ 13.0 │ + └──────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_sum(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 3.0 │ + │ 5.0 │ + │ 7.0 │ + │ 9.0 │ + │ 11.0 │ + └──────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Compute a rolling standard deviation. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_std(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 1.527525 │ + │ 2.0 │ + └──────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
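+
+ A small sketch of ``min_periods`` (made-up data): with ``min_periods=2`` a
+ result is emitted as soon as two non-null values are in the window, instead
+ of waiting for a full window of three:
+
+ >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]})
+ >>> df.select(
+ ...     pl.col("A").rolling_var(window_size=3, min_periods=2)
+ ... )  # doctest: +SKIP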
+ + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_var(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 2.333333 │ + │ 4.0 │ + └──────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_median(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_quantile(quantile=0.33, window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + └──────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int, bias: bool = ...) -> Self: + """ + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + """ + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def argsort(self, *args, **kwargs) -> Self: + ''' + Get the index values that would sort this column. + + Alias for :func:`Expr.arg_sort`. + + .. deprecated:: 0.16.5 + `Expr.argsort` will be removed in favour of `Expr.arg_sort`. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").argsort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def rank(self, *args, **kwargs) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. 
+ + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self, bias: bool = ...) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self, fisher: bool = ..., bias: bool = ...) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. 
+ If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, min_val: int | float, max_val: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + min_val + Minimum value. + max_val + Maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, min_val: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + min_val + Minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, max_val: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + max_val + Maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def reshape(self, dims: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dims + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `frac`. Defaults to 1 if + `frac` is None. + frac + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(frac=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. 
+ + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self, multithreaded: bool = ..., sort: bool = ...) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def entropy(self, base: float = ..., normalize: bool = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ..., parallel: bool = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self, *args, **kwargs) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def list(self) -> Self: + ''' + Aggregate to list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().list()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... 
).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + ''' + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/lazyframe/frame deleted file mode 100644 index 5f1b596..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/lazyframe/frame +++ /dev/null @@ -1,127 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.expr.expr import Expr as Expr -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.series import Series as Series -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, PolarsExprType as PolarsExprType, 
PythonLiteral as PythonLiteral, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath -from typing import Any, Callable, Concatenate, Iterable, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: set[str] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - @classmethod - def _scan_csv(cls, source: str, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... 
- def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, func: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_plan(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_optimized_plan(self, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, optimized: bool = ..., *, show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... 
- def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy[Self]: ... - def groupby_rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> LazyGroupBy[Self]: ... - def groupby_dynamic(self, index_column: str, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> LazyGroupBy[Self]: ... - def join_asof(self, other: LazyFrame, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., suffix: str = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, exprs: str | PolarsExprType | PythonLiteral | Series | Iterable[str | PolarsExprType | PythonLiteral | Series | None] | None = ..., *more_exprs: str | PolarsExprType | PythonLiteral | Series | None, **named_exprs: str | PolarsExprType | PythonLiteral | Series | None) -> Self: ... - def with_context(self, other): ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, periods: int, fill_value: Expr | int | str | float) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, fill_value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... 
- def median(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, maintain_order: bool = ..., subset: str | Sequence[str] | None = ..., keep: UniqueKeepStrategy = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def update(self, other: LazyFrame, on: None | str | Sequence[str] = ..., how: str = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..fa5125f --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/lazyframe/frame.pyi @@ -0,0 +1,3129 @@ +#: version 0.16.16 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... 
+ @classmethod + def _scan_csv(cls, source: str, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> Self: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. 
+ + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, func: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + func + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def describe_plan(self) -> str: + ''' + Create a string representation of the unoptimized query plan. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. 
+ common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + .. deprecated:: 0.16.10 + Use ``LazyFrame.explain`` + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).describe_plan() # doctest: +SKIP + + ''' + def describe_optimized_plan(self) -> str: + """Create a string representation of the optimized query plan.""" + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, *args, **kwargs) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + """ + Return the `k` largest elements. + + If 'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the 'k' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + """ + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. 
+ + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. 
+ type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... 
) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy[Self]: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: str) -> LazyGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. 
+ + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime) + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: str) -> LazyGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. 
+ + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + * \'datapoint\': Start from the first encountered data point. + * \'monday\': Start the window on the monday before the first data point. + + See Also + -------- + groupby_rolling + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... low=datetime(2021, 12, 16), + ... high=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... 
).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... low=datetime(2021, 12, 16), + ... high=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. 
+ tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ) + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ) + >>> population.join_asof( + ... gdp, left_on="date", right_on="date", strategy="backward" + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, exprs: str | PolarsExprType | PythonLiteral | Series | Iterable[str | PolarsExprType | PythonLiteral | Series | None] | None = ..., *more_exprs: str | PolarsExprType | PythonLiteral | Series | None, **named_exprs: str | PolarsExprType | PythonLiteral | Series | None) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... 
(pl.col("c").is_not()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... 
).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, periods: int, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + fill_value + fill None values with the result of this expression. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift_and_fill(periods=1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, fill_value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + fill_value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their standard deviation value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their variance value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. 
+ + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique().collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"]).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last").collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. 
Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, *args, **kwargs) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... 
"before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def update(self, other: LazyFrame, on: None | str | Sequence[str] = ..., how: str = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'Left\' will keep the left table rows as is. + \'Inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/series/series deleted file mode 100644 index 475dde0..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/series/series +++ /dev/null @@ -1,300 +0,0 @@ - -from datetime import date, datetime, time, timedelta -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.expr.expr import Expr as Expr -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, 
Callable, Collection, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: set[str] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex, nan_to_null: bool = ...) -> Self: ... - @classmethod - def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Any) -> Self: ... - def __sub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __floordiv__(self, other): ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, power: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... 
- def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> SeriesIter: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def qcut(self, quantiles: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def hist(self, bins: list[float] | None = ..., bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, in_place: bool = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, reverse: bool = ...) -> Series: ... - def cummin(self, reverse: bool = ...) -> Series: ... - def cumprod(self, reverse: bool = ...) -> Series: ... - def cumsum(self, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, descending: bool = ..., *, in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ..., descending: bool = ...) 
-> Series: ... - def arg_sort(self, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def argsort(self, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, start: Expr | datetime | date | time | int | float | str, end: Expr | datetime | date | time | int | float | str, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, fill_value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... 
- def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, periods: int, fill_value: int | Expr) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., descending: bool = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, bias: bool = ...) -> float | None: ... - def kurtosis(self, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, min_val: int | float, max_val: int | float) -> Series: ... - def clip_min(self, min_val: int | float) -> Series: ... - def clip_max(self, max_val: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ...) -> Self: ... - def reshape(self, dims: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... 
- def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - @property - def arr(self) -> ListNameSpace: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -class SeriesIter: - len: int - i: int - s: Series - def __init__(self, length: int, s: Series) -> None: ... - def __iter__(self) -> SeriesIter: ... - def __next__(self) -> Any: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/series/series.pyi new file mode 100644 index 0000000..56184fe --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.16/polars/series/series.pyi @@ -0,0 +1,3905 @@ +#: version 0.16.16 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, time, timedelta +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, 
numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar, Collection, NoReturn, Sequence + +TYPE_CHECKING: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array, rechunk: bool = ...) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, *args, **kwargs) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + @classmethod + def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def le(self, other: Any) -> Self: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self: + """Method equivalent of operator expression ``series == other``.""" + def ne(self, other: Any) -> Self: + """Method equivalent of operator expression ``series != other``.""" + def ge(self, other: Any) -> Self: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame: ... + def __sub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Series: ... 
+ def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame: ... + def __mod__(self, other: Any) -> Series: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, power: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> SeriesIter: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self) -> Any: + ''' + Return the series as a scalar. + + Equivalent to ``s[0]``, with a check that the shape is (1,). + + Examples + -------- + >>> s = pl.Series("a", [1]) + >>> s.item() + 1 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) 
-> Series: + """Compute the logarithm to a given base.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (6, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ count ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ unique ┆ 4 │ + │ null_count ┆ 1 │ + │ count ┆ 5 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: + ''' + Bin values into discrete values. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut(bins=[-1, 1]) + shape: (12, 3) + ┌──────┬─────────────┬──────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪══════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ + │ 1.5 ┆ inf ┆ (1.0, inf] │ + │ 2.0 ┆ inf ┆ (1.0, inf] │ + │ 2.5 ┆ inf ┆ (1.0, inf] │ + └──────┴─────────────┴──────────────┘ + + ''' + def qcut(self, quantiles: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: + ''' + Bin values into discrete values based on their quantiles. + + Parameters + ---------- + quantiles + Quaniles to create. + We expect quantiles ``0.0 <= quantile <= 1`` + labels + Labels to assign to the quantiles. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. 
+ + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75]) + shape: (8, 3) + ┌──────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪═══════════════╡ + │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ + │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ + │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1.0 ┆ inf ┆ (0.25, inf] │ + │ 2.0 ┆ inf ┆ (0.25, inf] │ + └──────┴─────────────┴───────────────┘ + + ''' + def hist(self, bins: list[float] | None = ..., bin_count: int | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self, sort: bool = ...) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ..., normalize: bool = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ..., parallel: bool = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. 
+ + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str, in_place: bool = ...) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self, reverse: bool = ...) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self, reverse: bool = ...) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self, reverse: bool = ...) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self, reverse: bool = ...) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. 
+ + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series, append_chunks: bool = ...) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. 
+ + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self) -> Series: + """ + Return the `k` largest elements. + + If 'descending=True` the smallest elements will be given. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + descending + Return the smallest elements. + + """ + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def argsort(self, *args, **kwargs) -> Series: + """ + Get the index values that would sort this Series. + + Alias for :func:`Series.arg_sort`. + + .. deprecated:: 0.16.5 + `Series.argsort` will be removed in favour of `Series.arg_sort`. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + """ + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self, maintain_order: bool = ...) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self, *args, **kwargs) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. 
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + .. deprecated:: 0.15.16 + `Series.explode` will be removed in favour of `Series.arr.explode` and + `Series.str.explode`. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ListNameSpace.explode : Explode a list column. + StringNameSpace.explode : Explode a string column. + + """ + def series_equal(self, other: Series, null_equal: bool = ..., strict: bool = ...) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self, use_pyarrow: bool = ...) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, start: Expr | datetime | date | time | int | float | str, end: Expr | datetime | date | time | int | float | str, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + start + Lower bound value (can be an expression or literal). + end + Upper bound value (can be an expression or literal). + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. 
+ + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self, ignore_nulls: bool = ...) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use pyarrow for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. 
+ + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, fill_value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + fill_value + Value used to fill nan values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, periods: int, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + fill_value + Fill None values with the result of this expression. + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Compute a rolling variance. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int, bias: bool = ...) -> Series: + ''' + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_skew(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 0.0 + 0.0 + 0.381802 + 0.0 + ] + + ''' + def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `frac`. Defaults to 1 if + `frac` is None. + frac + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self, in_place: bool = ...) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self, signed: bool = ...) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. 
For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. 
+ + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self, bias: bool = ...) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self, fisher: bool = ..., bias: bool = ...) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, min_val: int | float, max_val: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + min_val + Minimum value. + max_val + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, min_val: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + min_val + Minimum value. + + ''' + def clip_max(self, max_val: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. 
+ + Parameters + ---------- + max_val + Maximum value. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ''' + def reshape(self, dims: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dims + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def str(self): ... + @property + def struct(self): ... + +class SeriesIter: + def __init__(self, length: int, s: Series) -> None: ... + def __iter__(self) -> SeriesIter: ... + def __next__(self) -> Any: ... 
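(The Series stub above ends with the untyped accessor properties and the SeriesIter helper. Since a .pyi file is ordinary Python syntax, one quick way to sanity-check a freshly generated stub is simply to parse it with the standard library. The snippet below is a sketch only, not part of this diff, and the path is illustrative — adjust it to wherever the generated series stub actually lives in the repository.)

    import ast
    from pathlib import Path

    # Illustrative path only; substitute the stub file added by this change.
    stub_path = Path("src/polugins_type_gen/_stubs/0.16.18/polars/series/series.pyi")

    tree = ast.parse(stub_path.read_text())

    # List the top-level class definitions in the stub (e.g. Series, SeriesIter)
    # as a rough smoke test that generation produced a syntactically valid file.
    classes = [node.name for node in tree.body if isinstance(node, ast.ClassDef)]
    print(classes)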
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/dataframe/frame deleted file mode 100644 index acc920a..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/dataframe/frame +++ /dev/null @@ -1,280 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from io import BytesIO, IOBase -from pathlib import Path -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturned as NoRowsReturned, TooManyRowsReturned as TooManyRowsReturned -from polars.expr import Expr as Expr -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.lazyframe import LazyFrame as LazyFrame -from polars.polars import PyDataFrame as PyDataFrame -from polars.series import Series as Series -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as 
no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr -from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: set[str] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], infer_schema_length: int | None = ..., schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, columns: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: ... - def _div(self, other: Any, floordiv: bool) -> Self: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... 
- def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int]) -> Series: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, str]) -> Series: ... - @overload - def __getitem__(self, item: tuple[int, int]) -> Any: ... - @overload - def __getitem__(self, item: tuple[int, str]) -> Any: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | str | Path, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, include_header: bool = ..., header_name: str = ..., column_names: Iterator[str] | Sequence[str] | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: ... - @overload - def glimpse(self, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, return_as_string: Literal[True]) -> str: ... - def describe(self) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def frame_equal(self, other: DataFrame, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_col: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def pipe(self, func: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy[Self]: ... - def groupby_rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) 
-> RollingGroupBy[Self]: ... - def groupby_dynamic(self, index_column: str, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> DynamicGroupBy[Self]: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: DataFrame, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., suffix: str = ...) -> Self: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., inference_size: int = ...) -> Self: ... - def hstack(self, columns: list[Series] | DataFrame, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, fill_value: Expr | int | float | None) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, periods: int, fill_value: int | str | float) -> Self: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... 
- def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ...) -> Self: ... - def unique(self, maintain_order: bool = ..., subset: str | Sequence[str] | None = ..., keep: UniqueKeepStrategy = ...) -> Self: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, named: Literal[True]) -> list[dict[str, Any]]: ... - @overload - def iter_rows(self, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> Self: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs): ... - def merge_sorted(self, other: DataFrame, key: str) -> Self: ... - def update(self, other: DataFrame, on: None | str | Sequence[str] = ..., how: str = ...) 
-> Self: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/dataframe/frame.pyi new file mode 100644 index 0000000..56f2cf7 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/dataframe/frame.pyi @@ -0,0 +1,5398 @@ +#: version 0.16.18 +import P +import np as np +import pa as pa +import pd as pd +from _io import BytesIO + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Object as Object, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturned as NoRowsReturned, TooManyRowsReturned as TooManyRowsReturned +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr +from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +FLOAT_DTYPES: frozenset +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + 
_accessors: ClassVar[set] = ... + columns: list[str] + def __init__(self, *args, **kwargs) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], infer_schema_length: int | None = ..., schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., schema_overrides: SchemaDict | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, *args, **kwargs) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, *args, **kwargs) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. 
+ * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, *args, **kwargs) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, *args, **kwargs) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. 
+ + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> Self: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> Self: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also known as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. 
+ allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> Self: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int] | tuple[MultiRowSelector, str] | tuple[int, int] | tuple[int, str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self) -> Any: + ''' + Return the dataframe as a scalar. + + Equivalent to ``df[0,0]``, with a check that the shape is (1,1). 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> result = df.select((pl.col("a") * pl.col("b")).sum()) + >>> result + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 32 │ + └─────┘ + >>> result.item() + 32 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> numpy_array = df.to_numpy() + >>> type(numpy_array) + + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.DataFrame: + ''' + Cast to a pandas DataFrame. 
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). 
+ position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. 
Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). 
+ Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... 
"q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. 
Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection uri, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional generator/iterator that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
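+
+ As one illustrative alternative (a sketch only; output skipped), unpivoting with
+ ``melt`` keeps the data in long format and avoids forcing every value into a
+ single common dtype, which a transpose does because rows become columns:
+
+ >>> df.melt()  # doctest: +SKIP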
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self) -> Self: + ''' + Summary statistics for a DataFrame. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (7, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, *args, **kwargs) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + """ + Return the `k` largest elements. + + If 'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the 'k' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + """ + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_col: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_col + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. 
If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, func: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + func + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... 
return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy[Self]: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. 
+ + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: str) -> RollingGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + *groupby_dynamic* + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime) + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: str) -> DynamicGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + - \'window\': Truncate the start of the window with the \'every\' argument. + - \'datapoint\': Start from the first encountered data point. + - \'monday\': Start the window on the monday before the first data point. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... low=datetime(2021, 12, 16), + ... high=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "n": range(7), + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... 
) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... low=datetime(2021, 12, 16), + ... high=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. 
This is slower. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ) + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... 
], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ) + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ) + >>> population.join_asof( + ... gdp, left_on="date", right_on="date", strategy="backward" + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame) -> Self: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. 
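+
+ For instance (a sketch only; output skipped), a row-wise sum over the first two
+ columns could be written as follows; note that each row arrives as a plain tuple,
+ so values are accessed by position rather than by column name:
+
+ >>> df.apply(lambda row: row[0] + row[1])  # doctest: +SKIP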
+ + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select([pl.col("foo") * 2, pl.col("bar") * 3]) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... 
) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current DataFrame, with zero to \'n\' rows. + + Returns a DataFrame with identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, fill_value: Expr | int | float | None) -> Self: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + fill_value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function : {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + A predefined aggregate function str or an expression. 
+ maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar") + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... 
) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ … ┆ … │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, *args, **kwargs) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. 
+ + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, periods: int, fill_value: int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + fill_value + fill None values with this value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. 
+ + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. 
The columns + will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... 
+ shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... 
"ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"]) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last") + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... 
) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `frac`. Defaults to 1 if + `frac` is None. + frac + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. 
+ + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturned``, and + zero rows will raise ``NoRowsReturned`` (both inherit from ``RowsException``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you absolutely + require row-iteration you should strongly prefer ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialises all frame data as a list of rows. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.rows() + [(1, 2), (3, 4), (5, 6)] + >>> df.rows(named=True) + [{\'a\': 1, \'b\': 2}, {\'a\': 3, \'b\': 4}, {\'a\': 5, \'b\': 6}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. 
+ buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + An iterator of tuples (default) or dictionaries of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, *args, **kwargs) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> Self: + ''' + Return Pearson product-moment correlation coefficients. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. 
+ + Parameters + ---------- + kwargs + keyword arguments are passed to numpy corrcoef + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def update(self, other: DataFrame, on: None | str | Sequence[str] = ..., how: str = ...) -> Self: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'Left\' will keep the left table rows as is. + \'Inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
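The added DataFrame stub content ends here. As a minimal illustrative sketch only (assuming the repository layout and the `0.16.18` version directory that appear in this patch; the exact path below is an example, not something this diff adds), a generated stub of this size can be sanity-checked by parsing it with the standard library before it is committed:

# Illustrative sketch: confirm a generated stub such as the DataFrame stub
# above still parses as valid Python after post-processing.
# The path and version directory are example values, not part of the diff.
import ast
from pathlib import Path

stub_path = Path(
    "polugins_type_gen", "src", "polugins_type_gen", "_stubs",
    "0.16.18", "polars", "dataframe", "frame.pyi"
)
tree = ast.parse(stub_path.read_text())  # raises SyntaxError if the stub is malformed
methods = [node.name for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)]
print(f"{stub_path}: {len(methods)} function/method stubs parsed")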
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/expr/expr deleted file mode 100644 index 879cb83..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/expr/expr +++ /dev/null @@ -1,248 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Struct as Struct, UInt32 as UInt32, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.lazyframe import LazyFrame as LazyFrame -from polars.polars import PyExpr as PyExpr -from polars.series import Series as Series -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: set[str] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... 
- def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: Expr, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, reverse: bool = ...) -> Self: ... - def cumprod(self, reverse: bool = ...) -> Self: ... - def cummin(self, reverse: bool = ...) -> Self: ... - def cummax(self, reverse: bool = ...) -> Self: ... - def cumcount(self, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], strict: bool = ...) -> Self: ... - def sort(self, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ..., descending: bool = ...) -> Self: ... - def arg_sort(self, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, periods: int, fill_value: int | float | bool | str | Expr | list[Any]) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) 
-> Self: ... - def fill_nan(self, fill_value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., skip_nulls: bool = ..., pass_name: bool = ..., *, strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: Expr | str) -> Self: ... - def is_between(self, start: IntoExpr, end: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... 
- def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def argsort(self, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def rank(self, method: RankMethod = ..., descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, bias: bool = ...) -> Self: ... - def kurtosis(self, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, min_val: int | float, max_val: int | float) -> Self: ... - def clip_min(self, min_val: int | float) -> Self: ... - def clip_max(self, max_val: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def reshape(self, dims: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... 
- def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def entropy(self, base: float = ..., normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., parallel: bool = ...) -> Self: ... - def set_sorted(self, descending: bool = ...) -> Self: ... - def list(self) -> Self: ... - def shrink_dtype(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., dtype: PolarsDataType | None = ...) -> Self: ... - @property - def arr(self) -> ExprListNameSpace: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/expr/expr.pyi new file mode 100644 index 0000000..a77fe08 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/expr/expr.pyi @@ -0,0 +1,5744 @@ +#: version 0.16.18 +import P +import np as np +from datetime import timedelta +from polars.datatypes.classes import Struct as Struct, UInt32 as UInt32 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: ClassVar[None] = ... + _accessors: ClassVar[set] = ... 
+ @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... 
{"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + pl.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... [ + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ] + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... 
) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. + + >>> df.select([(pl.lit(10) / pl.all()).keep_name()]) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr): + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + ... + >>> df = pl.DataFrame({"a": ["a: 1", "b: 2", "c: 3"]}) + >>> df.with_columns(pl.col("a").pipe(extract_number)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... ] + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, *args, **kwargs) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + >>> df.select( + ... pl.all().reverse().map_alias(lambda colName: colName + "_reverse") + ... ) + shape: (2, 2) + ┌───────────┬───────────┐ + │ A_reverse ┆ B_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════════╪═══════════╡ + │ 2 ┆ 4 │ + │ 1 ┆ 3 │ + └───────────┴───────────┘ + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... 
) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: Expr, upcast: bool = ...) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └─────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... 
pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self, reverse: bool = ...) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any], strict: bool = ...) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... 
) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, *args, **kwargs) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + descending + Return the smallest elements. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").top_k(descending=True).alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, *args, **kwargs) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. 
Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, periods: int, fill_value: int | float | bool | str | Expr | list[Any]) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. 
+ + Parameters + ---------- + periods + Number of places to shift (may be negative). + fill_value + Fill None values with the result of this expression. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill(1, "a")) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.fill_null(strategy="zero") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ 0 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(99) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ 99 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(strategy="forward") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def fill_nan(self, fill_value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.fill_nan("zero") + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪══════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ zero │ + │ zero ┆ 6.0 │ + └──────┴──────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 6 │ + │ null ┆ 6 │ + └──────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... 
] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self, maintain_order: bool = ...) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + + Examples + -------- + Pass the name of a column to compute the expression over that column. 
+ + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. 
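Editorial aside: as the ``filter`` docstring above notes, frame-level filtering belongs on the (lazy) frame rather than inside an expression; a minimal sketch of that contrast, assuming the polars version this stub targets.

>>> df = pl.DataFrame({"group_col": ["g1", "g1", "g2"], "b": [1, 2, 3]})
>>> df.lazy().filter(pl.col("b") >= 2).collect()  # doctest: +IGNORE_RESULT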
+ + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, polars will assume that + the dtype remains unchanged. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + .. 
deprecated:: 0.15.16 + `Expr.explode` will be removed in favour of `Expr.arr.explode` and + `Expr.str.explode`. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + ExprStringNameSpace.explode : Explode a string column. + + """ + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`Expr.head`. + + Parameters + ---------- + n + Number of rows to return. + + """ + def and_(self, *others: Any) -> Self: + '''Method equivalent of logical "and" operator ``expr & other & ...``.''' + def or_(self, *others: Any) -> Self: + '''Method equivalent of logical "or" operator ``expr | other | ...``.''' + def eq(self, other: Any) -> Self: + """Method equivalent of equality operator ``expr == other``.""" + def ge(self, other: Any) -> Self: + '''Method equivalent of "greater than or equal" operator ``expr >= other``.''' + def gt(self, other: Any) -> Self: + '''Method equivalent of "greater than" operator ``expr > other``.''' + def le(self, other: Any) -> Self: + '''Method equivalent of "less than or equal" operator ``expr <= other``.''' + def lt(self, other: Any) -> Self: + '''Method equivalent of "less than" operator ``expr < other``.''' + def ne(self, other: Any) -> Self: + """Method equivalent of inequality operator ``expr != other``.""" + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + integer, float, or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("x").add(2).alias("x+2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x+2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 5 │ + │ 4 ┆ 6 │ + │ 5 ┆ 7 │ + └─────┴─────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("add")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ add │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Other integer or float value; accepts expression input. 
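Editorial aside: the operator-method equivalents above (``eq``, ``ge``, ``le``, ``and_`` and friends) carry no examples in this stub; a hedged sketch of how they chain, assuming the polars version this stub targets.

>>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]})
>>> df.select(pl.col("x").ge(2).and_(pl.col("x").le(4)).alias("in_range"))  # doctest: +IGNORE_RESULT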
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("x").floordiv(2).alias("x//2")) + shape: (5, 2) + ┌─────┬──────┐ + │ x ┆ x//2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ 0 │ + │ 2 ┆ 1 │ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + │ 5 ┆ 2 │ + └─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Other integer or float value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Other integer or float value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mul(2).alias("x*2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x*2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 2 ┆ 4 │ + │ 3 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Other integer or float value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").sub(2).alias("x-2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x-2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ -2 │ + │ 1 ┆ -1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Other integer or float value; accepts expression input. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("x").truediv(2).alias("x/2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x/2 │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 1.0 │ + │ 3 ┆ 1.5 │ + │ 4 ┆ 2.0 │ + │ 5 ┆ 2.5 │ + └─────┴─────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + The exponent; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").pow(3)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ 8.0 │ + │ 27.0 │ + │ 64.0 │ + └──────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of logical exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Other integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... 
) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("xor")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ xor │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame({"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("x ^ y"), + ... ) + shape: (4, 5) + ┌─────┬─────┬──────────┬──────────┬───────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ x ^ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ str ┆ i64 │ + ╞═════╪═════╪══════════╪══════════╪═══════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 │ + └─────┴─────┴──────────┴──────────┴───────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: Expr | str) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, start: IntoExpr, end: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + start + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + end + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. 
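Editorial aside: the ``is_in`` example above checks membership against a list column; a hedged sketch of the more common case, membership in a plain Python sequence, assuming the polars version this stub targets.

>>> df = pl.DataFrame({"a": [1, 2, 3, 4]})
>>> df.filter(pl.col("a").is_in([1, 3]))  # doctest: +IGNORE_RESULT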
+ + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self, signed: bool = ...) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'linear\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
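Editorial aside: the rolling docstrings describe temporal windows (``window_size`` as a duration string plus ``by`` and ``closed``) but only demonstrate integer windows; a hedged sketch, assuming the polars version this stub targets and a sorted ``by`` column.

>>> from datetime import date
>>> df = pl.DataFrame(
...     {
...         "date": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 4)],
...         "value": [3.0, 1.0, 2.0],
...     }
... )
>>> df.with_columns(
...     pl.col("value").rolling_min("2d", by="date", closed="right").alias("roll_min")
... )  # doctest: +IGNORE_RESULT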
+ + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_min(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + └──────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_max(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 8.0, 6.0, 2.0, 16.0, 10.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_mean(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.5 │ + │ 7.0 │ + │ 4.0 │ + │ 9.0 │ + │ 13.0 │ + └──────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
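Editorial aside: ``weights`` is described in the parameter lists above but never demonstrated; a hedged sketch of a weighted rolling sum (weights chosen arbitrarily), assuming the polars version this stub targets.

>>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0]})
>>> df.select(pl.col("A").rolling_sum(window_size=3, weights=[0.25, 0.25, 0.5]))  # doctest: +IGNORE_RESULT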
+ + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_sum(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 3.0 │ + │ 5.0 │ + │ 7.0 │ + │ 9.0 │ + │ 11.0 │ + └──────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Compute a rolling standard deviation. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_std(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 1.527525 │ + │ 2.0 │ + └──────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_var(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 2.333333 │ + │ 4.0 │ + └──────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. 
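Editorial aside: the notes above repeatedly point to ``groupby_rolling`` for computing several aggregates over one window definition; a hedged sketch, with the method name and arguments as they existed around the polars versions this stub covers.

>>> from datetime import date
>>> df = pl.DataFrame(
...     {
...         "date": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)],
...         "value": [1.0, 2.0, 3.0],
...     }
... )
>>> df.groupby_rolling(index_column="date", period="2d").agg(
...     [pl.col("value").min().alias("min"), pl.col("value").max().alias("max")]
... )  # doctest: +IGNORE_RESULT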
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_median(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_quantile(quantile=0.33, window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + └──────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int, bias: bool = ...) -> Self: + """ + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + """ + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def argsort(self, *args, **kwargs) -> Self: + ''' + Get the index values that would sort this column. + + Alias for :func:`Expr.arg_sort`. + + .. deprecated:: 0.16.5 + `Expr.argsort` will be removed in favour of `Expr.arg_sort`. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").argsort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def rank(self, *args, **kwargs) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. 
+ null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self, bias: bool = ...) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self, fisher: bool = ..., bias: bool = ...) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. 
+ If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, min_val: int | float, max_val: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + min_val + Minimum value. + max_val + Maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, min_val: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + min_val + Minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, max_val: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + max_val + Maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. 
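Editorial aside: ``clip``, ``clip_min`` and ``clip_max`` are numeric-only, and the docstrings above suggest a ``when``/``then``/``otherwise`` expression for other dtypes; a hedged sketch capping string values at "m", assuming the polars version this stub targets.

>>> df = pl.DataFrame({"s": ["a", "x", "m"]})
>>> df.with_columns(
...     pl.when(pl.col("s") > pl.lit("m"))
...     .then(pl.lit("m"))
...     .otherwise(pl.col("s"))
...     .alias("s_capped")
... )  # doctest: +IGNORE_RESULT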
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def reshape(self, dims: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dims + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `frac`. Defaults to 1 if + `frac` is None. + frac + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. 
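Editorial aside: ``sample`` also accepts ``n`` for a fixed number of draws, complementing the ``frac`` example that follows; a hedged sketch, assuming the polars version this stub targets.

>>> df = pl.DataFrame({"a": [1, 2, 3]})
>>> df.select(pl.col("a").sample(n=2, seed=0))  # doctest: +IGNORE_RESULT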
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(frac=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. 
+ + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self, multithreaded: bool = ..., sort: bool = ...) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def entropy(self, base: float = ..., normalize: bool = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ..., parallel: bool = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self, *args, **kwargs) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def list(self) -> Self: + ''' + Aggregate to list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().list()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... 
).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + dtype + Override output dtype. + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override output dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
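The _prepare_alpha stub declared at the end of the expr.pyi hunk above normalises the mutually exclusive com, span, half_life and alpha parameters of the ewm_* methods into a single smoothing factor. A minimal sketch of that conversion, based only on the formulas quoted in the ewm_mean/ewm_std/ewm_var docstrings above (the function name is illustrative and polars' actual argument validation is not reproduced), could look like:

import math

def prepare_alpha_sketch(com=None, span=None, half_life=None, alpha=None) -> float:
    # Exactly one decay specification is expected; the comments restate the
    # constraints from the docstrings rather than polars' real error handling.
    if com is not None:        # alpha = 1 / (1 + com),               com >= 0
        return 1.0 / (1.0 + com)
    if span is not None:       # alpha = 2 / (span + 1),              span >= 1
        return 2.0 / (span + 1.0)
    if half_life is not None:  # alpha = 1 - exp(-ln(2) / half_life), half_life > 0
        return 1.0 - math.exp(-math.log(2.0) / half_life)
    if alpha is not None:      # 0 < alpha <= 1, used as-is
        return alpha
    raise ValueError("one of com, span, half_life or alpha must be given")

With com=1 this gives alpha = 0.5, which is consistent with the ewm_mean(com=1) example output shown above (1.0, 1.666667, 2.428571).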
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/lazyframe/frame deleted file mode 100644 index 5f1b596..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/lazyframe/frame +++ /dev/null @@ -1,127 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.expr.expr import Expr as Expr -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.series import Series as Series -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, PolarsExprType as PolarsExprType, PythonLiteral as PythonLiteral, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath -from typing import Any, Callable, Concatenate, Iterable, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: set[str] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... 
- @classmethod - def _scan_csv(cls, source: str, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, func: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_plan(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_optimized_plan(self, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... 
- def show_graph(self, optimized: bool = ..., *, show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy[Self]: ... - def groupby_rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> LazyGroupBy[Self]: ... - def groupby_dynamic(self, index_column: str, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> LazyGroupBy[Self]: ... 
- def join_asof(self, other: LazyFrame, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., suffix: str = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, exprs: str | PolarsExprType | PythonLiteral | Series | Iterable[str | PolarsExprType | PythonLiteral | Series | None] | None = ..., *more_exprs: str | PolarsExprType | PythonLiteral | Series | None, **named_exprs: str | PolarsExprType | PythonLiteral | Series | None) -> Self: ... - def with_context(self, other): ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, periods: int, fill_value: Expr | int | str | float) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, fill_value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, maintain_order: bool = ..., subset: str | Sequence[str] | None = ..., keep: UniqueKeepStrategy = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def update(self, other: LazyFrame, on: None | str | Sequence[str] = ..., how: str = ...) -> Self: ... 
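The regenerated stub added in the next hunk begins with a "#: version 0.16.18" marker on its first line. A small, hypothetical helper (not part of this PR) for reading that marker back out of a stub file might look like:

from pathlib import Path

def stub_polars_version(stub: Path) -> str | None:
    # Regenerated stubs carry a "#: version X.Y.Z" comment as their first line;
    # return the version string, or None if the first line is not such a marker.
    first_line = stub.read_text().splitlines()[0]
    prefix = "#: version "
    return first_line[len(prefix):].strip() if first_line.startswith(prefix) else None

For the file added below, stub_polars_version(Path("polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/lazyframe/frame.pyi")) would return "0.16.18".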
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..4ac1f77 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/lazyframe/frame.pyi @@ -0,0 +1,3143 @@ +#: version 0.16.18 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) 
-> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> Self: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, func: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + func + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... 
) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def describe_plan(self) -> str: + ''' + Create a string representation of the unoptimized query plan. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + .. deprecated:: 0.16.10 + Use ``LazyFrame.explain`` + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).describe_plan() # doctest: +SKIP + + ''' + def describe_optimized_plan(self) -> str: + """Create a string representation of the optimized query plan.""" + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. 
+ projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, *args, **kwargs) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + """ + Return the `k` largest elements. + + If 'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the 'k' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. 
+ """ + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. 
+ Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. 
+ slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy[Self]: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. 
+ This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: str) -> LazyGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... 
"2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime) + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: str) -> LazyGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + * \'datapoint\': Start from the first encountered data point. 
+ * \'monday\': Start the window on the monday before the first data point. + + See Also + -------- + groupby_rolling + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... low=datetime(2021, 12, 16), + ... high=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... 
).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... low=datetime(2021, 12, 16), + ... high=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... 
).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ) + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ) + >>> population.join_asof( + ... gdp, left_on="date", right_on="date", strategy="backward" + ... 
).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, exprs: str | PolarsExprType | PythonLiteral | Series | Iterable[str | PolarsExprType | PythonLiteral | Series | None] | None = ..., *more_exprs: str | PolarsExprType | PythonLiteral | Series | None, **named_exprs: str | PolarsExprType | PythonLiteral | Series | None) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. 
+ *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... 
lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, periods: int, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + fill_value + fill None values with the result of this expression. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(periods=1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. 
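A minimal sketch of the fetch-versus-head distinction described in the notes for limit/head; the frame contents and row counts here are invented for illustration:

    import polars as pl

    lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5, 6], "b": [7, 8, 9, 10, 11, 12]})

    # head/limit are applied at the end of the plan, so the full query still runs.
    first_two = lf.head(2).collect()

    # fetch(n) instead limits the number of rows pulled in at the scan level,
    # which makes it handy for quickly sanity-checking a query.
    preview = lf.fetch(2)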
+ + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. 
+ strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, fill_value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + fill_value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique().collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"]).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last").collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... 
) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. 
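The warning above is easiest to see concretely: if the mapped function aggregates rows, letting a predicate be pushed below the map node would prune rows before the aggregation and silently change the result. A rough sketch using the parameters listed in this docstring, with invented data:

    import polars as pl

    lf = pl.LazyFrame({"a": [1, 2], "b": [3, 4]})

    # The function leaves the schema unchanged, so the defaults are fine here.
    doubled = lf.map(lambda df: df * 2).collect()

    # An aggregating function must not let predicates pass this node,
    # otherwise filtered-out rows would be missing from the sums.
    summed = lf.map(lambda df: df.sum(), predicate_pushdown=False).collect()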
+ + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, *args, **kwargs) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def update(self, other: LazyFrame, on: None | str | Sequence[str] = ..., how: str = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. 
+ how : {\'left\', \'inner\'} + \'Left\' will keep the left table rows as is. + \'Inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/series/series deleted file mode 100644 index ebd0bae..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/series/series +++ /dev/null @@ -1,302 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.expr.expr import Expr as Expr -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, 
RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: set[str] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex, nan_to_null: bool = ...) -> Self: ... - @classmethod - def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... 
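As a point of reference for the constructor signature above, a small sketch of the ArrayLike inputs a Series accepts; the names and values are made up, and the numpy input assumes that optional dependency is installed:

    import numpy as np
    import polars as pl

    # Any ArrayLike variant is accepted as `values` in Series.__init__.
    s_from_list = pl.Series("a", [1, 2, 3])
    s_from_numpy = pl.Series("b", np.array([1.0, 2.0, 3.0]))
    s_from_series = pl.Series("c", s_from_list)

    # `dtype` can force the resulting dtype at construction time.
    s_float = pl.Series("d", [1, 2, None], dtype=pl.Float64)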
- def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Any) -> Self: ... - def __sub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __floordiv__(self, other): ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> SeriesIter: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def qcut(self, quantiles: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... 
- def hist(self, bins: list[float] | None = ..., bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, in_place: bool = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, reverse: bool = ...) -> Series: ... - def cummin(self, reverse: bool = ...) -> Series: ... - def cumprod(self, reverse: bool = ...) -> Series: ... - def cumsum(self, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, descending: bool = ..., *, in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ..., descending: bool = ...) -> Series: ... - def arg_sort(self, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def argsort(self, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, start: IntoExpr, end: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, ignore_nulls: bool = ...) -> SeriesView: ... 
- def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, fill_value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, periods: int, fill_value: int | Expr) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) 
-> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, bias: bool = ...) -> float | None: ... - def kurtosis(self, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, min_val: int | float, max_val: int | float) -> Series: ... - def clip_min(self, min_val: int | float) -> Series: ... - def clip_max(self, max_val: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dims: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - @property - def arr(self) -> ListNameSpace: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -class SeriesIter: - len: int - i: int - s: Series - def __init__(self, length: int, s: Series) -> None: ... - def __iter__(self) -> SeriesIter: ... - def __next__(self) -> Any: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... 
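The regenerated stub added below starts with a version marker line (`#: version 0.16.18`), so a consumer can tell which polars release a given stub was generated against. A minimal sketch of reading that marker, assuming the stub layout shown in this diff; the helper name is illustrative and not part of the package:

    from pathlib import Path

    def stub_version(stub_path: Path) -> str | None:
        # Generated .pyi files begin with a marker line such as
        # "#: version 0.16.18"; return the version part, or None if absent.
        lines = stub_path.read_text().splitlines()
        first_line = lines[0] if lines else ""
        prefix = "#: version "
        return first_line[len(prefix):].strip() if first_line.startswith(prefix) else None

    # e.g. for the file added below (path taken from the diff header):
    # stub_version(Path("polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/series/series.pyi"))
    # -> "0.16.18"
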
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/series/series.pyi new file mode 100644 index 0000000..a0d74da --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.16.18/polars/series/series.pyi @@ -0,0 +1,3977 @@ +#: version 0.16.18 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar, Collection, NoReturn, Sequence + +TYPE_CHECKING: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array, rechunk: bool = ...) 
-> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, *args, **kwargs) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + @classmethod + def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def le(self, other: Any) -> Self: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self: + """Method equivalent of operator expression ``series == other``.""" + def ne(self, other: Any) -> Self: + """Method equivalent of operator expression ``series != other``.""" + def ge(self, other: Any) -> Self: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame: ... + def __sub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Series: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame: ... + def __mod__(self, other: Any) -> Series: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> SeriesIter: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... 
+ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self) -> Any: + ''' + Return the series as a scalar. + + Equivalent to ``s[0]``, with a check that the shape is (1,). + + Examples + -------- + >>> s = pl.Series("a", [1]) + >>> s.item() + 1 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Returns + ------- + Dictionary with summary statistics of a Series. 
+ + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (6, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ count ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ unique ┆ 4 │ + │ null_count ┆ 1 │ + │ count ┆ 5 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. 
+ + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: + ''' + Bin values into discrete values. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut(bins=[-1, 1]) + shape: (12, 3) + ┌──────┬─────────────┬──────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪══════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ + │ 1.5 ┆ inf ┆ (1.0, inf] │ + │ 2.0 ┆ inf ┆ (1.0, inf] │ + │ 2.5 ┆ inf ┆ (1.0, inf] │ + └──────┴─────────────┴──────────────┘ + + ''' + def qcut(self, quantiles: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: + ''' + Bin values into discrete values based on their quantiles. + + Parameters + ---------- + quantiles + Quaniles to create. + We expect quantiles ``0.0 <= quantile <= 1`` + labels + Labels to assign to the quantiles. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75]) + shape: (8, 3) + ┌──────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪═══════════════╡ + │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ + │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ + │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1.0 ┆ inf ┆ (0.25, inf] │ + │ 2.0 ┆ inf ┆ (0.25, inf] │ + └──────┴─────────────┴───────────────┘ + + ''' + def hist(self, bins: list[float] | None = ..., bin_count: int | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. 
+ bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self, sort: bool = ...) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ..., normalize: bool = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ..., parallel: bool = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str, in_place: bool = ...) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self, reverse: bool = ...) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self, reverse: bool = ...) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self, reverse: bool = ...) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self, reverse: bool = ...) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series, append_chunks: bool = ...) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. 
+ + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self) -> Series: + """ + Return the `k` largest elements. + + If 'descending=True` the smallest elements will be given. + + This has time complexity: + + .. 
math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + descending + Return the smallest elements. + + """ + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def argsort(self, *args, **kwargs) -> Series: + """ + Get the index values that would sort this Series. + + Alias for :func:`Series.arg_sort`. + + .. deprecated:: 0.16.5 + `Series.argsort` will be removed in favour of `Series.arg_sort`. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + """ + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self, maintain_order: bool = ...) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self, *args, **kwargs) -> bool: + """ + Check if the Series is sorted. 
+ + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. 
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + .. deprecated:: 0.15.16 + `Series.explode` will be removed in favour of `Series.arr.explode` and + `Series.str.explode`. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ListNameSpace.explode : Explode a list column. + StringNameSpace.explode : Explode a string column. + + """ + def series_equal(self, other: Series, null_equal: bool = ..., strict: bool = ...) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self, use_pyarrow: bool = ...) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. 
+ + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, start: IntoExpr, end: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + start + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + end + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self, ignore_nulls: bool = ...) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. 
+ zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use pyarrow for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... 
) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, fill_value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + fill_value + Value used to fill nan values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, periods: int, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + fill_value + Fill None values with the result of this expression. + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) 
-> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., center: bool = ...) -> Series: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int, bias: bool = ...) -> Series: + ''' + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_skew(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 0.0 + 0.0 + 0.381802 + 0.0 + ] + + ''' + def sample(self, n: int | None = ..., frac: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `frac`. Defaults to 1 if + `frac` is None. + frac + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self, in_place: bool = ...) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self, signed: bool = ...) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. 
For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. 
+ + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self, bias: bool = ...) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self, fisher: bool = ..., bias: bool = ...) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, min_val: int | float, max_val: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + min_val + Minimum value. + max_val + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, min_val: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + min_val + Minimum value. + + ''' + def clip_max(self, max_val: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. 
+ + Parameters + ---------- + max_val + Maximum value. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + dtype + Override output dtype. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + Override output dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dims: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dims + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. 
If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def str(self): ... + @property + def struct(self): ... + +class SeriesIter: + def __init__(self, length: int, s: Series) -> None: ... + def __iter__(self) -> SeriesIter: ... + def __next__(self) -> Any: ... 
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/dataframe/frame deleted file mode 100644 index c7c27d8..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/dataframe/frame +++ /dev/null @@ -1,282 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from io import BytesIO, IOBase -from pathlib import Path -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.expr import Expr as Expr -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.lazyframe import LazyFrame as LazyFrame -from polars.polars import PyDataFrame as PyDataFrame -from polars.series import Series as Series -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, 
UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr -from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: set[str] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, names: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: ... - def _div(self, other: Any, floordiv: bool) -> Self: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... 
- def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: Iterator[str] | Sequence[str] | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... 
- def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy[Self]: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> RollingGroupBy[Self]: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> DynamicGroupBy[Self]: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ...) -> Self: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> Self: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: Expr | int | float | None) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... 
- def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> Self: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ...) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... 
- def interpolate(self) -> Self: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs): ... - def merge_sorted(self, other: DataFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/dataframe/frame.pyi new file mode 100644 index 0000000..6573de9 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/dataframe/frame.pyi @@ -0,0 +1,5599 @@ +#: version 0.17.10 +import P +import np as np +import pa as pa +import pd as pd +from _io import BytesIO + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr +from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as 
_process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. 
+ infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. 
+ + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. 
+ + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> Self: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. 
+ + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> numpy_array = df.to_numpy() + >>> type(numpy_array) + + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. 
Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) 
-> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. 
+ table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). 
+ sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. 
+ https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... 
table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. 
If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection uri, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional generator/iterator that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
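+ Each boolean passed to ``descending`` applies to the column in the same
+ position, so in the next example ``"c"`` is sorted ascending and ``"a"``
+ descending.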
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. 
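+
+ Notes
+ -----
+ The frame is modified in place (the example below calls this out) and is
+ also returned, so the call can be chained.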
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... 
} + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy[Self]: + ''' + Start a groupby operation. 
+ + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use + *groupby_dynamic* + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. 
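+ For instance, with ``every="1h"`` and ``period="2h"`` consecutive windows
+ overlap by an hour, so a single row can fall into two groups; a sketch,
+ using ``df`` as defined in the examples below:
+
+ >>> df.groupby_dynamic("time", every="1h", period="2h").agg(pl.count()) # doctest: +SKIP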
+ + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + - \'window\': Truncate the start of the window with the \'every\' argument. + - \'datapoint\': Start from the first encountered data point. + - \'monday\': Start the window on the monday before the first data point. + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... 
) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + Examples + -------- + Upsample a DataFrame by a certain interval. 
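+
+ Note that the newly inserted rows are filled with nulls, which is why the
+ example finishes with a ``forward_fill``; a sketch of the intermediate
+ result, with ``df`` as defined below:
+
+ >>> df.upsample(time_column="time", every="1mo", by="groups") # doctest: +SKIP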
+ + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... 
], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. 
+ - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select([pl.col("foo") * 2, pl.col("bar") * 3]) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. 
+ + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current DataFrame, with zero to \'n\' rows. + + Returns a DataFrame with identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... 
) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> Self: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function : {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + A predefined aggregate function str or an expression. 
+ maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot( + ... values="baz", index="foo", columns="bar", aggregate_function="first" + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... ) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. 
+ fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... ) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... 
) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. 
+ + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialises all frame data as a list of rows. + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.rows() + [(1, 2), (3, 4), (5, 6)] + >>> df.rows(named=True) + [{\'a\': 1, \'b\': 2}, {\'a\': 3, \'b\': 4}, {\'a\': 5, \'b\': 6}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters in your use-case + you should export to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + An iterator of tuples (default) or dictionaries (if named) of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. 
+ + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... 
"b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> Self: + ''' + Return Pearson product-moment correlation coefficients. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + keyword arguments are passed to numpy corrcoef + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. 
+ + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/expr/expr deleted file mode 100644 index 2df2d26..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/expr/expr +++ /dev/null @@ -1,250 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.lazyframe import LazyFrame as LazyFrame -from polars.polars import PyExpr as PyExpr -from polars.series import Series as Series -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils.convert 
import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: set[str] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: Expr, *, upcast: bool = ...) 
-> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any], *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... 
- def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: Expr | str) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... 
- def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def arr(self) -> ExprListNameSpace: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
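Note: the extensionless stub deleted above is replaced below by a versioned expr.pyi whose first line is a "#: version <x.y.z>" header comment, and the stubs live under a per-version directory (src/polugins_type_gen/_stubs/0.17.10/polars/expr/expr.pyi in this diff). As a minimal sketch, assuming only that header format and directory layout (the helper below is hypothetical and not part of the package), the version stamp could be read back to sanity-check a bundled stub:

# Hypothetical illustration only; not part of polugins_type_gen's API.
# Reads the "#: version <x.y.z>" comment that the generator prepends to
# each regenerated .pyi file (visible at the top of the new stub below).
from pathlib import Path

def read_stub_version(stub_path: Path) -> str:
    # The header is always the first line of the generated stub.
    first_line = stub_path.read_text().splitlines()[0]
    assert first_line.startswith("#: version "), f"unexpected header: {first_line!r}"
    return first_line.removeprefix("#: version ").strip()

# Example usage (path assumed, matching the layout shown in this diff):
# stub = Path("src/polugins_type_gen/_stubs/0.17.10/polars/expr/expr.pyi")
# assert read_stub_version(stub) == "0.17.10"

Carrying the version inside the file itself, rather than only in the directory name, means a stub can still be matched to the polars release it was generated from even after it is copied out of the versioned tree.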
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/expr/expr.pyi new file mode 100644 index 0000000..5b0c336 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/expr/expr.pyi @@ -0,0 +1,6122 @@ +#: version 0.17.10 +import P +import np as np +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... 
+ def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + pl.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... [ + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ] + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. 
+ + >>> df.select([(pl.lit(10) / pl.all()).keep_name()]) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr): + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + ... + >>> df = pl.DataFrame({"a": ["a: 1", "b: 2", "c: 3"]}) + >>> df.with_columns(pl.col("a").pipe(extract_number)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + >>> df.select( + ... pl.all().reverse().map_alias(lambda colName: colName + "_reverse") + ... ) + shape: (2, 2) + ┌───────────┬───────────┐ + │ A_reverse ┆ B_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════════╪═══════════╡ + │ 2 ┆ 4 │ + │ 1 ┆ 3 │ + └───────────┴───────────┘ + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. 
+ + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... 
"b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: Expr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └─────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... 
) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... 
) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... 
) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any]) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.fill_null(strategy="zero") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ 0 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(99) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ 99 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(strategy="forward") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.fill_nan("zero") + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪══════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ zero │ + │ zero ┆ 6.0 │ + └──────┴──────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 6 │ + │ null ┆ 6 │ + └──────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. 
+ + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. 
+ + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. 
+ + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. 
+ + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + ExprStringNameSpace.explode : Explode a string column. + + """ + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... 
) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... 
pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of logical exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... 
schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: Expr | str) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... 
) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'linear\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... 
"b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_min(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + └──────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_max(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 8.0, 6.0, 2.0, 16.0, 10.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_mean(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.5 │ + │ 7.0 │ + │ 4.0 │ + │ 9.0 │ + │ 13.0 │ + └──────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_sum(window_size=2), + ... ] + ... 
) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 3.0 │ + │ 5.0 │ + │ 7.0 │ + │ 9.0 │ + │ 11.0 │ + └──────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_std(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 1.527525 │ + │ 2.0 │ + └──────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 
2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_var(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 2.333333 │ + │ 4.0 │ + └──────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_median(window_size=3), + ... ] + ... 
) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_quantile(quantile=0.33, window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + └──────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... 
pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + """ + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + """ + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. 
+ + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). 
If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. 
+ + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. 
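# Editorial sketch (not part of the generated stub): com, span and half_life in
# the ewm_* docstrings above are alternative ways of specifying the smoothing
# factor alpha; the formulas quoted there reduce to the small helper below
# (the helper name is made up for illustration).
import math

def alpha_from(com=None, span=None, half_life=None):
    # exactly one of the three decay specifications should be given
    if com is not None:
        return 1.0 / (1.0 + com)                  # alpha = 1 / (1 + gamma)
    if span is not None:
        return 2.0 / (span + 1.0)                 # alpha = 2 / (theta + 1)
    if half_life is not None:
        return 1.0 - math.exp(-math.log(2.0) / half_life)
    raise ValueError("specify com, span or half_life")

assert alpha_from(com=1) == 0.5                   # ewm_mean(com=1) smooths with alpha = 0.5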
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. 
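# Editorial sketch: with adjust=True the ewm_* docstrings above define the EW mean
# as a weighted average with weights (1 - alpha)^i over past observations.
# Reproducing the ewm_mean(com=1) example (alpha = 0.5, no nulls) by hand:
def ew_mean_adjusted(xs, alpha):
    out = []
    for t in range(len(xs)):
        weights = [(1 - alpha) ** i for i in range(t + 1)]          # newest value first
        num = sum(w * x for w, x in zip(weights, reversed(xs[: t + 1])))
        out.append(num / sum(weights))
    return out

vals = ew_mean_adjusted([1, 2, 3], alpha=0.5)
assert [round(v, 6) for v in vals] == [1.0, 1.666667, 2.428571]     # matches the docstring output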
+ + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. 
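# Editorial sketch: entropy() above computes -sum(pk * log(pk)) over the
# (optionally normalised) values; reproducing the base-2 docstring example for
# [1, 2, 3] with plain Python:
import math

values = [1, 2, 3]
pk = [v / sum(values) for v in values]            # normalize=True behaviour
entropy = -sum(p * math.log(p, 2) for p in pk)
assert round(entropy, 6) == 1.459148              # matches the docstring output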
valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/lazyframe/frame deleted file mode 100644 index be5c5cd..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/lazyframe/frame +++ /dev/null @@ -1,131 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.expr.expr import Expr as Expr -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.series import Series as Series -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath -from typing import Any, Callable, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: set[str] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... 
- @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_plan(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_optimized_plan(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... 
- def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy[Self]: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> LazyGroupBy[Self]: ... 
- def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> LazyGroupBy[Self]: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other): ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... 
- def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..e256914 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/lazyframe/frame.pyi @@ -0,0 +1,3314 @@ +#: version 0.17.10 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... 
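# Editorial sketch (not part of the generated stub): the _scan_* classmethods
# above are what the public pl.scan_* readers dispatch to. A tiny end-to-end
# sketch, assuming polars >= 0.17; the CSV path is illustrative.
import polars as pl

pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}).write_csv("example.csv")
lf = pl.scan_csv("example.csv")                   # LazyFrame; nothing is read yet
print(lf.select(pl.col("a").sum()).collect())     # collect() triggers the actual scan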
+ @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> Self: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. 
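# Editorial sketch: write_json/from_json above serialise the *logical plan*, not
# the data; a hedged round-trip check under the same polars >= 0.17 assumption:
import polars as pl

lf = pl.LazyFrame({"foo": [1, 2, 3]}).select((pl.col("foo") * 2).alias("doubled"))
plan = lf.write_json()                   # JSON string describing the query plan
lf_again = pl.LazyFrame.from_json(plan)  # rebuild an equivalent LazyFrame from it
assert lf.collect().frame_equal(lf_again.collect())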
+ common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def describe_plan(self) -> str: + ''' + Create a string representation of the unoptimized query plan. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + .. deprecated:: 0.16.10 + Use ``LazyFrame.explain`` + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).describe_plan() # doctest: +SKIP + + ''' + def describe_optimized_plan(self) -> str: + """Create a string representation of the optimized query plan.""" + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. 
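# Editorial sketch: explain() above returns the query plan as a string; comparing
# the unoptimized and optimized renderings is an easy way to see pushdown and
# expression simplification at work (exact plan text varies by polars version):
import polars as pl

lf = (
    pl.LazyFrame({"a": [1, 2, 3], "b": [6, 5, 4]})
    .select(pl.col("a"), pl.col("b"))
    .filter(pl.col("a") > 1)
)
print(lf.explain(optimized=False))  # plan exactly as written
print(lf.explain(optimized=True))   # typically shows the predicate pushed into the scan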
+ + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. 
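# Editorial sketch: top_k/bottom_k above return the extreme rows without a full
# sort; a small consistency check against sort + head (ties may be ordered
# differently, so only the selected values are compared):
import polars as pl

lf = pl.LazyFrame({"a": ["a", "b", "a", "b", "b", "c"], "b": [2, 1, 1, 3, 2, 1]})
top = lf.top_k(2, by="b").collect()
via_sort = lf.sort("b", descending=True).head(2).collect()
assert set(top["b"].to_list()) == set(via_sort["b"].to_list())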
+ + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... 
"c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. 
+ + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. 
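# Editorial sketch: fetch() above is a debugging shortcut that caps the number of
# rows read by each scan; the final row count is not guaranteed once filters run.
# File path below is illustrative.
import polars as pl

pl.DataFrame({"x": list(range(100))}).write_csv("debug.csv")
lf = pl.scan_csv("debug.csv").filter(pl.col("x") % 2 == 0)
print(lf.fetch(10))             # scans at most 10 rows, then applies the filter
print(lf.collect().height)      # the full query keeps 50 rows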
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy[Self]: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. 
+ maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.groupby_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + * \'datapoint\': Start from the first encountered data point. + * \'monday\': Start the window on the monday before the first data point. + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_rolling + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... 
).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... 
).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... 
).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. 
+ + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. 
+ + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... 
) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/series/series deleted file mode 100644 index 1ca4475..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/series/series +++ /dev/null @@ -1,310 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.expr.expr import Expr as Expr -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as 
RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: set[str] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - @classmethod - def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... 
- @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - def __floordiv__(self, other): ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... 
- def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, maintain_order: bool = ...) -> DataFrame: ... - def qcut(self, quantiles: list[float], *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... 
- def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... 
- def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - @property - def arr(self) -> ListNameSpace: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... 
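Aside (not part of the diff): the regenerated stub added in the next hunk is written with a `.pyi` suffix and begins with a `#: version <polars version>` marker line, as seen in the new file that follows. Below is a minimal, illustrative sketch of reading that marker back from a stub file; the helper name is hypothetical and the example path is taken from the tree layout shown in this diff.

# Illustrative sketch only (hypothetical helper, not part of this repository's diff).
# It reads the "#: version X.Y.Z" marker that each regenerated stub starts with.
from __future__ import annotations

from pathlib import Path


def read_stub_version(stub_path: Path) -> str | None:
    # The generated stubs begin with a line like "#: version 0.17.10".
    first_line = stub_path.read_text().splitlines()[0]
    prefix = "#: version "
    return first_line[len(prefix):].strip() if first_line.startswith(prefix) else None


# Example usage (path assumed from the diff below):
# read_stub_version(
#     Path("polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/series/series.pyi")
# )  # -> "0.17.10"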
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/series/series.pyi new file mode 100644 index 0000000..95409cb --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.10/polars/series/series.pyi @@ -0,0 +1,4032 @@ +#: version 0.17.10 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... 
+ @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + @classmethod + def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def le(self, other: Any) -> Self: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self: + """Method equivalent of operator expression ``series == other``.""" + def ne(self, other: Any) -> Self: + """Method equivalent of operator expression ``series != other``.""" + def ge(self, other: Any) -> Self: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) 
-> np.ndarray[Any, Any]: ... + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. 
+ + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ...) -> DataFrame: + ''' + Bin values into discrete values. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1]) + shape: (12, 3) + ┌──────┬─────────────┬──────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪══════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ + │ 1.5 ┆ inf ┆ (1.0, inf] │ + │ 2.0 ┆ inf ┆ (1.0, inf] │ + │ 2.5 ┆ inf ┆ (1.0, inf] │ + └──────┴─────────────┴──────────────┘ + + ''' + def qcut(self, quantiles: list[float]) -> DataFrame: + ''' + Bin values into discrete values based on their quantiles. + + Parameters + ---------- + quantiles + Quaniles to create. + We expect quantiles ``0.0 <= quantile <= 1`` + labels + Labels to assign to the quantiles. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75]) + shape: (8, 3) + ┌──────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪═══════════════╡ + │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ + │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ + │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1.0 ┆ inf ┆ (0.25, inf] │ + │ 2.0 ┆ inf ┆ (0.25, inf] │ + └──────┴─────────────┴───────────────┘ + + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. 
+ + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. 
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ListNameSpace.explode : Explode a list column. + StringNameSpace.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+
+ - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32`
+ - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64`
+ - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64`
+ - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64`
+ - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32`
+ - Other data types will be left unchanged.
+
+ Examples
+ --------
+ Replicating the pandas
+ `pd.Series.factorize
+ <https://pandas.pydata.org/docs/reference/api/pandas.Series.factorize.html>`_
+ method.
+
+ >>> s = pl.Series("values", ["a", None, "x", "a"])
+ >>> s.cast(pl.Categorical).to_physical()
+ shape: (4,)
+ Series: \'values\' [u32]
+ [
+ 0
+ null
+ 1
+ 0
+ ]
+
+ '''
+ def to_list(self) -> list[Any]:
+ '''
+ Convert this Series to a Python List. This operation clones data.
+
+ Parameters
+ ----------
+ use_pyarrow
+ Use pyarrow for the conversion.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.to_list()
+ [1, 2, 3]
+ >>> type(s.to_list())
+ <class 'list'>
+
+ '''
+ def rechunk(self) -> Self:
+ """
+ Create a single chunk of memory for this Series.
+
+ Parameters
+ ----------
+ in_place
+ In place or not.
+
+ """
+ def reverse(self) -> Series:
+ '''
+ Return Series in reverse order.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8)
+ >>> s.reverse()
+ shape: (3,)
+ Series: \'a\' [i8]
+ [
+ 3
+ 2
+ 1
+ ]
+
+ '''
+ def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series:
+ '''
+ Get a boolean mask of the values that fall between the given start/end values.
+
+ Parameters
+ ----------
+ lower_bound
+ Lower bound value. Accepts expression input. Non-expression inputs
+ (including strings) are parsed as literals.
+ upper_bound
+ Upper bound value. Accepts expression input. Non-expression inputs
+ (including strings) are parsed as literals.
+ closed : {\'both\', \'left\', \'right\', \'none\'}
+ Define which sides of the interval are closed (inclusive).
+
+ Examples
+ --------
+ >>> s = pl.Series("num", [1, 2, 3, 4, 5])
+ >>> s.is_between(2, 4)
+ shape: (5,)
+ Series: \'num\' [bool]
+ [
+ false
+ true
+ true
+ true
+ false
+ ]
+
+ Use the ``closed`` argument to include or exclude the values at the bounds:
+
+ >>> s.is_between(2, 4, closed="left")
+ shape: (5,)
+ Series: \'num\' [bool]
+ [
+ false
+ true
+ true
+ false
+ false
+ ]
+
+ You can also use strings as well as numeric/temporal values:
+
+ >>> s = pl.Series("s", ["a", "b", "c", "d", "e"])
+ >>> s.is_between("b", "d", closed="both")
+ shape: (5,)
+ Series: \'s\' [bool]
+ [
+ false
+ true
+ true
+ true
+ false
+ ]
+
+ '''
+ def is_numeric(self) -> bool:
+ '''
+ Check if this Series datatype is numeric.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.is_numeric()
+ True
+
+ '''
+ def is_integer(self, signed: bool | None = ...) -> bool:
+ '''
+ Check if this Series datatype is an integer (signed or unsigned).
+
+ Parameters
+ ----------
+ signed
+ * if `None`, both signed and unsigned integer dtypes will match.
+ * if `True`, only signed integer dtypes will be considered a match.
+ * if `False`, only unsigned integer dtypes will be considered a match.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32)
+ >>> s.is_integer()
+ True
+ >>> s.is_integer(signed=False)
+ True
+ >>> s.is_integer(signed=True)
+ False
+
+ '''
+ def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool:
+ """
+ Check if this Series datatype is temporal.
+ + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use pyarrow for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). 
+ + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + ''' + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_skew(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 0.0 + 0.0 + 0.381802 + 0.0 + ] + + ''' + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) 
-> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. 
+ + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
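# Illustrative usage sketch (not part of the generated stub or this patch):
# exercises a few of the Series methods documented above, with results
# matching the doctest outputs shown in their docstrings.
import polars as pl

s = pl.Series("a", [1, 2, None, 4])
filled = s.fill_null(strategy="forward")    # [1, 2, 2, 4]
floats = filled.cast(pl.Float64)            # cast between data types
means = floats.rolling_mean(window_size=2)  # [null, 1.5, 2.0, 3.0]
assert means.len() == 4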
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/dataframe/frame deleted file mode 100644 index c7c27d8..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/dataframe/frame +++ /dev/null @@ -1,282 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from io import BytesIO, IOBase -from pathlib import Path -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.expr import Expr as Expr -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.lazyframe import LazyFrame as LazyFrame -from polars.polars import PyDataFrame as PyDataFrame -from polars.series import Series as Series -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, 
UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr -from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: set[str] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, names: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: ... - def _div(self, other: Any, floordiv: bool) -> Self: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... 
- def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: Iterator[str] | Sequence[str] | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... 
- def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy[Self]: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> RollingGroupBy[Self]: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> DynamicGroupBy[Self]: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ...) -> Self: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> Self: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: Expr | int | float | None) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... 
- def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> Self: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ...) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... 
- def interpolate(self) -> Self: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs): ... - def merge_sorted(self, other: DataFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/dataframe/frame.pyi new file mode 100644 index 0000000..688edd9 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/dataframe/frame.pyi @@ -0,0 +1,5605 @@ +#: version 0.17.11 +import P +import np as np +import pa as pa +import pd as pd +from _io import BytesIO + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr +from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as 
_process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. 
+ infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. 
+ + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. 
+ + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> Self: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. 
+ + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> numpy_array = df.to_numpy() + >>> type(numpy_array) + + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. 
Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) 
-> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. 
+ table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). 
+ sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. 
+ https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... 
table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. 
If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection uri, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional generator/iterator that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If ``descending=True``, the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If ``descending=True``, the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' largest. Bottom-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert.
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... 
} + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy[Self]: + ''' + Start a groupby operation. 
+ + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use + *groupby_dynamic* + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. 
+ + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + - \'window\': Truncate the start of the window with the \'every\' argument. + - \'datapoint\': Start from the first encountered data point. + - \'monday\': Start the window on the monday before the first data point. + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... 
) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. 
+ + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... 
"gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. 
+ + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select([pl.col("foo") * 2, pl.col("bar") * 3]) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... 
) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current DataFrame, with zero to \'n\' rows. + + Returns a DataFrame with identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> Self: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function : {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + A predefined aggregate function str or an expression. 
+ maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot( + ... values="baz", index="foo", columns="bar", aggregate_function="first" + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... ) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. 
+ fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... ) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... 
) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. 
+ + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialises all frame data as a list of rows. + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.rows() + [(1, 2), (3, 4), (5, 6)] + >>> df.rows(named=True) + [{\'a\': 1, \'b\': 2}, {\'a\': 3, \'b\': 4}, {\'a\': 5, \'b\': 6}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters in your use-case + you should export to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + An iterator of tuples (default) or dictionaries (if named) of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. 
+ + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... 
"b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> Self: + ''' + Return Pearson product-moment correlation coefficients. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + keyword arguments are passed to numpy corrcoef + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. 
+ + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/expr/expr deleted file mode 100644 index 2df2d26..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/expr/expr +++ /dev/null @@ -1,250 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.lazyframe import LazyFrame as LazyFrame -from polars.polars import PyExpr as PyExpr -from polars.series import Series as Series -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils.convert 
import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: set[str] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: Expr, *, upcast: bool = ...) 
-> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any], *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... 
- def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: Expr | str) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... 
- def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def arr(self) -> ExprListNameSpace: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/expr/expr.pyi new file mode 100644 index 0000000..4dbde2b --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/expr/expr.pyi @@ -0,0 +1,6122 @@ +#: version 0.17.11 +import P +import np as np +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... 
+ def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + pl.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... [ + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ] + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. 
+ + >>> df.select([(pl.lit(10) / pl.all()).keep_name()]) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr): + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + ... + >>> df = pl.DataFrame({"a": ["a: 1", "b: 2", "c: 3"]}) + >>> df.with_columns(pl.col("a").pipe(extract_number)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + >>> df.select( + ... pl.all().reverse().map_alias(lambda colName: colName + "_reverse") + ... ) + shape: (2, 2) + ┌───────────┬───────────┐ + │ A_reverse ┆ B_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════════╪═══════════╡ + │ 2 ┆ 4 │ + │ 1 ┆ 3 │ + └───────────┴───────────┘ + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. 
+ + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... 
"b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: Expr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └─────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... 
) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... 
) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... 
) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any]) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.fill_null(strategy="zero") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ 0 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(99) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ 99 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(strategy="forward") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.fill_nan("zero") + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪══════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ zero │ + │ zero ┆ 6.0 │ + └──────┴──────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 6 │ + │ null ┆ 6 │ + └──────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. 
+ + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. 
+ + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. 
+ + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. 
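Editorial aside on ``apply`` from the entry above: its Notes suggest memoising expensive element-wise functions with ``@lru_cache``. A rough sketch of that pattern (the function and data are made up for illustration):

from functools import lru_cache

import polars as pl

@lru_cache(maxsize=None)
def expensive(x: int) -> int:
    # Stand-in for a costly scalar computation; the cache means repeated
    # input values are only computed once.
    return x * x

df = pl.DataFrame({"a": [1, 2, 2, 1, 3]})
out = df.with_columns(pl.col("a").apply(expensive).alias("a_squared"))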
+ + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + ExprStringNameSpace.explode : Explode a string column. + + """ + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... 
) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... 
pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of logical exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... 
schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: Expr | str) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... 
) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'linear\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... 
"b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_min(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + └──────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_max(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 8.0, 6.0, 2.0, 16.0, 10.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_mean(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.5 │ + │ 7.0 │ + │ 4.0 │ + │ 9.0 │ + │ 13.0 │ + └──────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_sum(window_size=2), + ... ] + ... 
) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 3.0 │ + │ 5.0 │ + │ 7.0 │ + │ 9.0 │ + │ 11.0 │ + └──────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_std(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 1.527525 │ + │ 2.0 │ + └──────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 
2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_var(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 2.333333 │ + │ 4.0 │ + └──────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_median(window_size=3), + ... ] + ... 
) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_quantile(quantile=0.33, window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + └──────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... 
pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + """ + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + """ + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. 
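Editorial aside, before the ``diff`` examples below: a quick sketch of the advice in ``rolling_apply`` above (prefer the dedicated rolling functions). Both columns should hold the same values; the data reuses the doctest above.

import polars as pl

df = pl.DataFrame({"A": [1.0, 2.0, 9.0, 2.0, 13.0]})
# The custom rolling_apply and the built-in rolling_mean agree here;
# the built-in is the recommended, faster spelling.
out = df.select(
    [
        pl.col("A").rolling_apply(lambda s: s.mean(), window_size=3).alias("custom"),
        pl.col("A").rolling_mean(window_size=3).alias("builtin"),
    ]
)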
+ + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). 
If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. 
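Editorial aside, before the ``sign`` details below: the ``clip`` docstrings above point to a "when, then, otherwise" expression for dtypes that ``clip`` does not support. A sketch of that alternative, shown on integers purely for illustration and mirroring ``clip(1, 10)``:

import polars as pl

df = pl.DataFrame({"foo": [-50, 5, None, 50]})
# Null predicates do not match, so nulls fall through to `otherwise`
# and stay null, matching clip's behaviour.
out = df.with_columns(
    pl.when(pl.col("foo") < 1)
    .then(1)
    .when(pl.col("foo") > 10)
    .then(10)
    .otherwise(pl.col("foo"))
    .alias("foo_clipped")
)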
+ + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. 
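Editorial aside on the trigonometric expressions above: they are ordinary element-wise expressions and compose with operators such as ``**`` (documented at ``pow`` earlier). A small sketch with illustrative data:

import polars as pl

df = pl.DataFrame({"angle": [0.0, 0.5, 1.0]})
# sin**2 + cos**2 should evaluate to approximately 1.0 on every row.
out = df.select(
    (pl.col("angle").sin() ** 2 + pl.col("angle").cos() ** 2).alias("identity")
)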
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. 
+ + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts, and might be faster. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk))`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated.
valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to use fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()`` to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ...
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/lazyframe/frame deleted file mode 100644 index be5c5cd..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/lazyframe/frame +++ /dev/null @@ -1,131 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.expr.expr import Expr as Expr -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.series import Series as Series -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath -from typing import Any, Callable, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: set[str] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... 
- @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_plan(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_optimized_plan(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... 
- def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy[Self]: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> LazyGroupBy[Self]: ... 
- def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> LazyGroupBy[Self]: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other): ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... 
- def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..13ad70c --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/lazyframe/frame.pyi @@ -0,0 +1,3314 @@ +#: version 0.17.11 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... 
+ @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> Self: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. 
+ common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def describe_plan(self) -> str: + ''' + Create a string representation of the unoptimized query plan. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + .. deprecated:: 0.16.10 + Use ``LazyFrame.explain`` + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).describe_plan() # doctest: +SKIP + + ''' + def describe_optimized_plan(self) -> str: + """Create a string representation of the optimized query plan.""" + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. 
+ + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. 
+ + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... 
"c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. 
+ + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy[Self]: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. 
+ maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.groupby_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + * \'datapoint\': Start from the first encountered data point. + * \'monday\': Start the window on the monday before the first data point. + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_rolling + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... 
).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... 
).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... 
).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. 
+ + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. 
+ + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... 
) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/series/series deleted file mode 100644 index e311e87..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/series/series +++ /dev/null @@ -1,328 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.expr.expr import Expr as Expr -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as 
RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: set[str] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - @classmethod - def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... 
- @overload - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - def __floordiv__(self, other): ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... 
- def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, maintain_order: bool = ...) -> DataFrame: ... - def qcut(self, quantiles: list[float], *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... 
- def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... 
- def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - @property - def arr(self) -> ListNameSpace: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... 
- @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/series/series.pyi new file mode 100644 index 0000000..4b70834 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.11/polars/series/series.pyi @@ -0,0 +1,4032 @@ +#: version 0.17.11 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: ClassVar[None] = ... + _accessors: ClassVar[set] = ... 
+ def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + @classmethod + def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... 
+ def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. 
+ + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ...) -> DataFrame: + ''' + Bin values into discrete values. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1]) + shape: (12, 3) + ┌──────┬─────────────┬──────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪══════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ + │ 1.5 ┆ inf ┆ (1.0, inf] │ + │ 2.0 ┆ inf ┆ (1.0, inf] │ + │ 2.5 ┆ inf ┆ (1.0, inf] │ + └──────┴─────────────┴──────────────┘ + + ''' + def qcut(self, quantiles: list[float]) -> DataFrame: + ''' + Bin values into discrete values based on their quantiles. + + Parameters + ---------- + quantiles + Quaniles to create. + We expect quantiles ``0.0 <= quantile <= 1`` + labels + Labels to assign to the quantiles. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75]) + shape: (8, 3) + ┌──────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪═══════════════╡ + │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ + │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ + │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1.0 ┆ inf ┆ (0.25, inf] │ + │ 2.0 ┆ inf ┆ (0.25, inf] │ + └──────┴─────────────┴───────────────┘ + + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. 
+ + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. 
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ListNameSpace.explode : Explode a list column. + StringNameSpace.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. 
+ + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use pyarrow for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). 
+ + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + ''' + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_skew(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 0.0 + 0.0 + 0.381802 + 0.0 + ] + + ''' + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) 
-> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. 
+ + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
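`shrink_dtype`, `implode` and `get_chunks` above are stubbed with one-line summaries and no doctest; a minimal usage sketch, assuming a polars version from the range these stubs cover (the exact narrowed dtype may vary by version):

    import polars as pl

    s = pl.Series("a", [1, 2, 3])
    s.implode()       # one-row Series of dtype list[i64]: [[1, 2, 3]]
    s.shrink_dtype()  # same values, narrowed to the smallest fitting dtype (here Int64 -> Int8)
    s.get_chunks()    # the underlying chunks as a list of Series
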
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/dataframe/frame deleted file mode 100644 index 4c0dd9a..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/dataframe/frame +++ /dev/null @@ -1,280 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from io import BytesIO, IOBase -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as 
UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr -from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: set[str] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, names: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: ... - def _div(self, other: Any, floordiv: bool) -> Self: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... 
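The comparison dunders in this stub (`__eq__`, `__ne__`, `__gt__`, ...) are annotated to return `DataFrame` rather than `bool`; a small sketch of the element-wise behaviour that annotation models (assumes polars is installed):

    import polars as pl

    df = pl.DataFrame({"a": [1, 2, 3]})
    mask = df == 2  # element-wise comparison: a boolean DataFrame, matching the __eq__ annotation
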
- def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: Iterator[str] | Sequence[str] | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... 
- def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy[Self]: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> RollingGroupBy[Self]: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> DynamicGroupBy[Self]: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ...) -> Self: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> Self: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: Expr | int | float | None) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... 
- def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> Self: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ...) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... 
- def interpolate(self) -> Self: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs): ... - def merge_sorted(self, other: DataFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/dataframe/frame.pyi new file mode 100644 index 0000000..1a5375f --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/dataframe/frame.pyi @@ -0,0 +1,5637 @@ +#: version 0.17.12 +import P +import np as np +import pa as pa +import pd as pd +from _io import BytesIO + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr +from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as 
_process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. 
+ infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. 
+ + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. 
+ + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> Self: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. 
+ + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... 
+ + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. 
+ + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. 
+ + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. 
+ * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. 
+ https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. 
Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. 
+ compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection uri, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional generator/iterator that yields column names. Will be used to + replace the columns in the DataFrame. 
+ + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... 
} + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy[Self]: + ''' + Start a groupby operation. 
+ + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use
+        *groupby_dynamic*.
+
+        The `period` and `offset` arguments are created either from a timedelta, or
+        by using the following string language:
+
+        - 1ns (1 nanosecond)
+        - 1us (1 microsecond)
+        - 1ms (1 millisecond)
+        - 1s (1 second)
+        - 1m (1 minute)
+        - 1h (1 hour)
+        - 1d (1 day)
+        - 1w (1 week)
+        - 1mo (1 calendar month)
+        - 1y (1 calendar year)
+        - 1i (1 index count)
+
+        Or combine them:
+        "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+
+        Suffix with `"_saturating"` to indicate that dates too large for
+        their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28)
+        instead of erroring.
+
+        In case of a groupby_rolling on an integer column, the windows are defined by:
+
+        - **"1i" # length 1**
+        - **"10i" # length 10**
+
+        Parameters
+        ----------
+        index_column
+            Column used to group based on the time window.
+            Often of type Date/Datetime.
+            This column must be sorted in ascending order. If not, the output will not
+            make sense.
+
+            In case of a rolling groupby on indices, dtype needs to be one of
+            {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if
+            performance matters use an Int64 column.
+        period
+            length of the window
+        offset
+            offset of the window. Default is -period
+        closed : {\'right\', \'left\', \'both\', \'none\'}
+            Define which sides of the temporal interval are closed (inclusive).
+        by
+            Also group by this column/these columns
+
+        Returns
+        -------
+        RollingGroupBy
+            Object you can call ``.agg`` on to aggregate by groups, the result
+            of which will be sorted by `index_column` (but note that if `by` columns are
+            passed, it will only be sorted within each `by` group).
+
+        See Also
+        --------
+        groupby_dynamic
+
+        Examples
+        --------
+        >>> dates = [
+        ...     "2020-01-01 13:45:48",
+        ...     "2020-01-01 16:42:13",
+        ...     "2020-01-01 16:45:09",
+        ...     "2020-01-02 18:12:48",
+        ...     "2020-01-03 19:45:32",
+        ...     "2020-01-08 23:16:43",
+        ... ]
+        >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns(
+        ...     pl.col("dt").str.strptime(pl.Datetime).set_sorted()
+        ... )
+        >>> out = df.groupby_rolling(index_column="dt", period="2d").agg(
+        ...     [
+        ...         pl.sum("a").alias("sum_a"),
+        ...         pl.min("a").alias("min_a"),
+        ...         pl.max("a").alias("max_a"),
+        ...     ]
+        ... )
+        >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1]
+        >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1]
+        >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1]
+        >>> out
+        shape: (6, 4)
+        ┌─────────────────────┬───────┬───────┬───────┐
+        │ dt                  ┆ sum_a ┆ min_a ┆ max_a │
+        │ ---                 ┆ ---   ┆ ---   ┆ ---   │
+        │ datetime[μs]        ┆ i64   ┆ i64   ┆ i64   │
+        ╞═════════════════════╪═══════╪═══════╪═══════╡
+        │ 2020-01-01 13:45:48 ┆ 3     ┆ 3     ┆ 3     │
+        │ 2020-01-01 16:42:13 ┆ 10    ┆ 3     ┆ 7     │
+        │ 2020-01-01 16:45:09 ┆ 15    ┆ 3     ┆ 7     │
+        │ 2020-01-02 18:12:48 ┆ 24    ┆ 3     ┆ 9     │
+        │ 2020-01-03 19:45:32 ┆ 11    ┆ 2     ┆ 9     │
+        │ 2020-01-08 23:16:43 ┆ 1     ┆ 1     ┆ 1     │
+        └─────────────────────┴───────┴───────┴───────┘
+
+        '''
+    def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy[Self]:
+        '''
+        Group based on a time value (or index value of type Int32, Int64).
+
+        Time windows are calculated and rows are assigned to windows. Different from a
+        normal groupby, a row can be a member of multiple groups. The time/index
+        window could be seen as a rolling window, with a window size determined by
+        dates/times/values instead of slots in the DataFrame.
+ + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + - \'window\': Truncate the start of the window with the \'every\' argument. + - \'datapoint\': Start from the first encountered data point. + - \'monday\': Start the window on the monday before the first data point. + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... 
) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. 
+ + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... 
"gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. 
+ + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... 
) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current DataFrame, with zero to \'n\' rows. + + Returns a DataFrame with identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> Self: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function : {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + A predefined aggregate function str or an expression. 
+ maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot( + ... values="baz", index="foo", columns="bar", aggregate_function="first" + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... ) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. 
+ fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... ) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... 
) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. 
+ + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialises all frame data as a list of rows. + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.rows() + [(1, 2), (3, 4), (5, 6)] + >>> df.rows(named=True) + [{\'a\': 1, \'b\': 2}, {\'a\': 3, \'b\': 4}, {\'a\': 5, \'b\': 6}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters in your use-case + you should export to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + An iterator of tuples (default) or dictionaries (if named) of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. 
+ + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... 
"b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> Self: + ''' + Return Pearson product-moment correlation coefficients. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + keyword arguments are passed to numpy corrcoef + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. 
+ + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/expr/expr deleted file mode 100644 index 82a3927..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/expr/expr +++ /dev/null @@ -1,249 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from 
polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: set[str] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: Expr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... 
- def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any], *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... 
- def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: Expr | str) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... 
- def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def arr(self) -> ExprListNameSpace: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
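The stub removed above (and its 0.17.12 replacement in the next file diff) only declares signatures; the snippet below is a minimal, hedged usage sketch of how a few of the declared expression methods (rolling_mean, clip, is_between, max().over()) are called at runtime. It assumes a polars release in the range these stubs target is installed; the column names and data are invented purely for illustration.

import polars as pl

df = pl.DataFrame({"a": [1, 2, 3, 4], "group": ["x", "x", "y", "y"]})

# Exercise a few expressions whose signatures appear in the stub above.
out = df.select(
    pl.col("a").rolling_mean(window_size=2).alias("a_rolling_mean"),  # mean over a window of 2 rows
    pl.col("a").clip(1, 3).alias("a_clipped"),                        # clamp values to the range [1, 3]
    pl.col("a").is_between(1, 3).alias("a_in_range"),                 # boolean membership check
)

# Window function: per-group maximum broadcast back onto each row.
out2 = df.with_columns(pl.col("a").max().over("group").alias("a_max_per_group"))
print(out, out2, sep="\n")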
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/expr/expr.pyi new file mode 100644 index 0000000..630a7aa --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/expr/expr.pyi @@ -0,0 +1,6177 @@ +#: version 0.17.12 +import P +import np as np +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... 
+ def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + pl.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. 
+ + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to root column name. + + See Also + -------- + alias + map_alias + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to root column name. + + See Also + -------- + alias + map_alias + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... 
"B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + + >>> df.select(pl.all().reverse().suffix("_reverse")).with_columns( + ... pl.all().map_alias( + ... # Remove "_reverse" suffix and convert to lower case. + ... lambda col_name: col_name.rsplit("_reverse", 1)[0].lower() + ... ) + ... ) + shape: (2, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 2 ┆ 4 ┆ 2 ┆ 4 │ + │ 1 ┆ 3 ┆ 1 ┆ 3 │ + └───────────┴───────────┴─────┴─────┘ + + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... 
) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: Expr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └─────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... 
) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. 
+ + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. 
+ For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any]) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.fill_null(strategy="zero") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ 0 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(99) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ 99 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(strategy="forward") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.fill_nan("zero") + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪══════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ zero │ + │ zero ┆ 6.0 │ + └──────┴──────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 6 │ + │ null ┆ 6 │ + └──────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. 
+ + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. 
+ + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + See Also + -------- + map_dict + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + """ + Explode a list or utf8 Series. 
+ + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + ExprStringNameSpace.explode : Explode a string column. + + """ + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... 
) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... 
pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of logical exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... 
schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: Expr | str) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... 
) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'linear\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... 
"b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_min(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + └──────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_max(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 8.0, 6.0, 2.0, 16.0, 10.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_mean(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.5 │ + │ 7.0 │ + │ 4.0 │ + │ 9.0 │ + │ 13.0 │ + └──────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_sum(window_size=2), + ... ] + ... 
) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 3.0 │ + │ 5.0 │ + │ 7.0 │ + │ 9.0 │ + │ 11.0 │ + └──────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_std(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 1.527525 │ + │ 2.0 │ + └──────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 
2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_var(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 2.333333 │ + │ 4.0 │ + └──────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_median(window_size=3), + ... ] + ... 
) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_quantile(quantile=0.33, window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + └──────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... 
pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + """ + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + """ + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. 
+ + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). 
If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. 
+ + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. 
+ + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. 
valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """Cache this expression so that it only is executed once per context.""" + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... 
).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def arr(self): ... + @property + def bin(self): ... 
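
The accessor properties that close the ``Expr`` stub (``arr``, ``bin``, ``cat``, ``dt``, ``meta``, ``str``, ``struct``) are declared explicitly, so attribute access such as ``pl.col(...).str`` resolves to a known property rather than falling through to the catch-all ``__getattr__``. A minimal usage sketch, assuming a polars version in this range is installed (not part of the generated stub itself):

    import polars as pl

    df = pl.DataFrame({"name": ["ada", "grace"]})
    # ``.str`` resolves via the ``str`` property declared in the stub above,
    # so the namespace method below is discoverable to editors and checkers.
    print(df.select(pl.col("name").str.to_uppercase()))
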
+ @property + def cat(self): ... + @property + def dt(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/lazyframe/frame deleted file mode 100644 index a28dae7..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/lazyframe/frame +++ /dev/null @@ -1,129 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr, Series as Series -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath -from typing import Any, Callable, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: set[str] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., 
infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... - @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_plan(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... 
- def describe_optimized_plan(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy[Self]: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> LazyGroupBy[Self]: ... 
- def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> LazyGroupBy[Self]: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other): ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... 
- def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..586fb97 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/lazyframe/frame.pyi @@ -0,0 +1,3314 @@ +#: version 0.17.12 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... 
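
Both the removed stub above and the regenerated one type ``pipe`` with ``Concatenate``/``ParamSpec`` (``Callable[Concatenate[LazyFrame, P], T]``), which lets a type checker validate the arguments forwarded to the piped callable. A minimal sketch of what that buys, assuming polars is installed; the helper ``add_shifted`` is a hypothetical example, not part of the stub:

    import polars as pl

    def add_shifted(lf: pl.LazyFrame, col: str, periods: int = 1) -> pl.LazyFrame:
        # Receives the frame first; the remaining args/kwargs are forwarded by pipe().
        return lf.with_columns(pl.col(col).shift(periods).alias(f"{col}_shifted"))

    lf = pl.LazyFrame({"a": [1, 2, 3]})
    # With Concatenate[LazyFrame, P], a checker can verify "a" and periods=1 here.
    print(lf.pipe(add_shifted, "a", periods=1).collect())
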
+ @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> Self: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. 
+ common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def describe_plan(self) -> str: + ''' + Create a string representation of the unoptimized query plan. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + .. deprecated:: 0.16.10 + Use ``LazyFrame.explain`` + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).describe_plan() # doctest: +SKIP + + ''' + def describe_optimized_plan(self) -> str: + """Create a string representation of the optimized query plan.""" + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. 
+ + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. 
+ + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... 
"c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. 
+ + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy[Self]: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. 
+ maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.groupby_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + * \'datapoint\': Start from the first encountered data point. + * \'monday\': Start the window on the monday before the first data point. + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_rolling + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... 
).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... 
).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... 
).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. 
+ + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. 
+ + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... 
) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/series/series deleted file mode 100644 index 7266a1b..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/series/series +++ /dev/null @@ -1,327 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, 
SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: set[str] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - @classmethod - def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... 
- def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - def __floordiv__(self, other): ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... 
- def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, maintain_order: bool = ...) -> DataFrame: ... - def qcut(self, quantiles: list[float], *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... 
- def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... 
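# --- Editor's illustrative sketch; not part of the generated stub diff above. ---
# The rolling_* signatures listed above all share the same window parameters
# (window_size, weights, min_periods, center). A minimal, hedged usage sketch,
# assuming polars 0.17.x defaults (min_periods defaults to window_size, so the
# first window_size - 1 results are null):
#
#   >>> import polars as pl
#   >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0])
#   >>> s.rolling_mean(window_size=2).to_list()
#   [None, 1.5, 2.5, 3.5]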
- def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - @property - def arr(self) -> ListNameSpace: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... 
- @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/series/series.pyi new file mode 100644 index 0000000..4f6593f --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.12/polars/series/series.pyi @@ -0,0 +1,4033 @@ +#: version 0.17.12 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: ClassVar[None] = ... + _accessors: ClassVar[set] = ... 
+ def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + @classmethod + def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... 
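# --- Editor's illustrative sketch; not part of the generated stub diff above. ---
# The le/lt/eq/ne/ge/gt methods documented above are the method equivalents of
# the comparison operators. A minimal sketch, assuming polars is importable:
#
#   >>> import polars as pl
#   >>> s = pl.Series("a", [1, 2, 3])
#   >>> (s <= 2).to_list()
#   [True, True, False]
#   >>> s.le(2).to_list()  # method equivalent of `s <= 2`
#   [True, True, False]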
+ def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. 
+ + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ...) -> DataFrame: + ''' + Bin values into discrete values. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1]) + shape: (12, 3) + ┌──────┬─────────────┬──────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪══════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ + │ 1.5 ┆ inf ┆ (1.0, inf] │ + │ 2.0 ┆ inf ┆ (1.0, inf] │ + │ 2.5 ┆ inf ┆ (1.0, inf] │ + └──────┴─────────────┴──────────────┘ + + ''' + def qcut(self, quantiles: list[float]) -> DataFrame: + ''' + Bin values into discrete values based on their quantiles. + + Parameters + ---------- + quantiles + Quaniles to create. + We expect quantiles ``0.0 <= quantile <= 1`` + labels + Labels to assign to the quantiles. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75]) + shape: (8, 3) + ┌──────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪═══════════════╡ + │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ + │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ + │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1.0 ┆ inf ┆ (0.25, inf] │ + │ 2.0 ┆ inf ┆ (0.25, inf] │ + └──────┴─────────────┴───────────────┘ + + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. 
+ + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. 
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ListNameSpace.explode : Explode a list column. + StringNameSpace.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. 
+ + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use pyarrow for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). 
+ + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + ''' + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_skew(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 0.0 + 0.0 + 0.381802 + 0.0 + ] + + ''' + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) 
-> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. 
+ + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
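The ewm_mean/ewm_std/ewm_var docstrings above spell out the com-to-alpha mapping (alpha = 1 / (1 + com)) and the adjust=False recursion y_t = (1 - alpha) * y_{t-1} + alpha * x_t. A minimal sketch, assuming a polars release matching this stub, that reproduces the recursion by hand and compares it against Series.ewm_mean (the input values are illustrative, not taken from the diff):

import polars as pl

# com=1 maps to alpha = 1 / (1 + com) = 0.5, per the docstring formula above.
alpha = 0.5
xs = [1.0, 2.0, 3.0]

# adjust=False recursion: y_0 = x_0, then y_t = (1 - alpha) * y_{t-1} + alpha * x_t
y = xs[0]
manual = [y]
for x in xs[1:]:
    y = (1 - alpha) * y + alpha * x
    manual.append(y)

print(manual)                                                      # [1.0, 1.5, 2.25]
print(pl.Series("a", xs).ewm_mean(com=1, adjust=False).to_list())  # expected to agree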
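Likewise, the skew docstring above defines the biased Fisher-Pearson coefficient g_1 = m_3 / m_2^(3/2) with m_i = (1/N) * sum((x - mean)^i). A small sketch, again assuming an installed polars matching this stub, computing it directly and via Series.skew (bias is left at its default, i.e. the uncorrected estimate; the data values are illustrative):

import polars as pl

xs = [1.0, 2.0, 3.0, 4.0, 10.0]
n = len(xs)
mean = sum(xs) / n

# Biased central moments m_i = (1/N) * sum((x - mean)**i), as in the docstring.
m2 = sum((x - mean) ** 2 for x in xs) / n
m3 = sum((x - mean) ** 3 for x in xs) / n
g1 = m3 / m2 ** 1.5

print(g1)                         # ~1.1384 for these illustrative values
print(pl.Series("a", xs).skew())  # biased estimate; expected to agree with g1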
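The rolling_* docstrings above all note that min_periods defaults to the window size, which is why the leading entries in their examples are null. A short sketch of lowering min_periods, under the same assumption about the polars version (newer releases may rename or reorder these parameters):

import polars as pl

s = pl.Series("a", [1, 2, 3, 4, 5])

# Default: min_periods == window_size, so the first two results are null.
print(s.rolling_mean(window_size=3).to_list())
# With min_periods=1, a partial window is enough to produce a value.
print(s.rolling_mean(window_size=3, min_periods=1).to_list())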
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/dataframe/frame deleted file mode 100644 index 4c0dd9a..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/dataframe/frame +++ /dev/null @@ -1,280 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from io import BytesIO, IOBase -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as 
UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr -from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: set[str] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, names: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: ... - def _div(self, other: Any, floordiv: bool) -> Self: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... 
- def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: Iterator[str] | Sequence[str] | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... 
- def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy[Self]: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> RollingGroupBy[Self]: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> DynamicGroupBy[Self]: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ...) -> Self: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> Self: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: Expr | int | float | None) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... 
- def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> Self: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ...) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... 
- def interpolate(self) -> Self: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs): ... - def merge_sorted(self, other: DataFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/dataframe/frame.pyi new file mode 100644 index 0000000..d37d938 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/dataframe/frame.pyi @@ -0,0 +1,5636 @@ +#: version 0.17.13 +import P +import np as np +import pa as pa +import pd as pd +from _io import BytesIO + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr +from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as 
_process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. 
+ infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. 
+ + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. 
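The `__dataframe__` notes above describe the dataframe interchange protocol without showing a consumer; a minimal sketch of how another library would read a polars frame through it, assuming pyarrow >= 11 (which ships `pyarrow.interchange.from_dataframe`) is installed — the frame contents here are illustrative only:

import polars as pl
import pyarrow.interchange as pai

df = pl.DataFrame({"foo": [1, 2, 3], "bar": ["a", "b", "c"]})
proto = df.__dataframe__(allow_copy=True)  # interchange object, as typed in the stub above
table = pai.from_dataframe(df)             # pyarrow reads the frame via the same protocol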
+ + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> Self: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. 
+ + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... 
+ + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. 
+ + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. 
+ + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. 
+ * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. 
+ https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. 
Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. 
+ compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection URI, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if the table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers and validity bitmaps, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer is + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as the first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional generator/iterator that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... 
} + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy[Self]: + ''' + Start a groupby operation. 
+ + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use + *groupby_dynamic* + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. 
+ + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + - \'window\': Truncate the start of the window with the \'every\' argument. + - \'datapoint\': Start from the first encountered data point. + - \'monday\': Start the window on the monday before the first data point. + - \'tuesday\': Start the window on the tuesday before the first data point. + - ... + - \'sunday\': Start the window on the sunday before the first data point. + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. 
+ + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. 
+ + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... 
"gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. 
+ + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... 
) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current DataFrame, with zero to \'n\' rows. + + Returns a DataFrame with identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> Self: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function : {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + A predefined aggregate function str or an expression. 
+ maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot( + ... values="baz", index="foo", columns="bar", aggregate_function="first" + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... ) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. 
+ fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... ) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... 
) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. 
+ + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialises all frame data as a list of rows. + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.rows() + [(1, 2), (3, 4), (5, 6)] + >>> df.rows(named=True) + [{\'a\': 1, \'b\': 2}, {\'a\': 3, \'b\': 4}, {\'a\': 5, \'b\': 6}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters in your use-case + you should export to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + An iterator of tuples (default) or dictionaries (if named) of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. 
+ + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... 
"b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> Self: + ''' + Return Pearson product-moment correlation coefficients. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + keyword arguments are passed to numpy corrcoef + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. 
+ + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/expr/expr deleted file mode 100644 index 18c4bc1..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/expr/expr +++ /dev/null @@ -1,249 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from 
polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: set[str] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: Expr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... 
- def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any], *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... 
- def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: Expr | str) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... 
- def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def arr(self) -> ExprListNameSpace: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/expr/expr.pyi new file mode 100644 index 0000000..4b7e289 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/expr/expr.pyi @@ -0,0 +1,6186 @@ +#: version 0.17.13 +import P +import np as np +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... 
+ def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + pl.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. 
+ + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to root column name. + + See Also + -------- + alias + map_alias + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to root column name. + + See Also + -------- + alias + map_alias + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... 
"B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + + >>> df.select(pl.all().reverse().suffix("_reverse")).with_columns( + ... pl.all().map_alias( + ... # Remove "_reverse" suffix and convert to lower case. + ... lambda col_name: col_name.rsplit("_reverse", 1)[0].lower() + ... ) + ... ) + shape: (2, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 2 ┆ 4 ┆ 2 ┆ 4 │ + │ 1 ┆ 3 ┆ 1 ┆ 3 │ + └───────────┴───────────┴─────┴─────┘ + + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... 
) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: Expr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └─────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... 
) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. 
+ + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. 
+ For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any]) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.fill_null(strategy="zero") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ 0 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(99) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ 99 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(strategy="forward") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.fill_nan("zero") + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪══════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ zero │ + │ zero ┆ 6.0 │ + └──────┴──────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 6 │ + │ null ┆ 6 │ + └──────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. 
+ + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. 
+ + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + See Also + -------- + map_dict + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + """ + Explode a list or utf8 Series. 
+ + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + ExprStringNameSpace.explode : Explode a string column. + + """ + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... 
) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... 
pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of logical exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... 
schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: Expr | str) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... 
) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'linear\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... 
"b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_min(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + └──────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_max(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 8.0, 6.0, 2.0, 16.0, 10.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_mean(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.5 │ + │ 7.0 │ + │ 4.0 │ + │ 9.0 │ + │ 13.0 │ + └──────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_sum(window_size=2), + ... ] + ... 
) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 3.0 │ + │ 5.0 │ + │ 7.0 │ + │ 9.0 │ + │ 11.0 │ + └──────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_std(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 1.527525 │ + │ 2.0 │ + └──────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 
2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_var(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 2.333333 │ + │ 4.0 │ + └──────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_median(window_size=3), + ... ] + ... 
) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_quantile(quantile=0.33, window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + └──────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... 
pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + """ + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + """ + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. 
+ + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). 
If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. 
+ + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. 
+ + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. 
valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + This can actually hurt performance and can have a lot of contention. + It is advised not to use it until actually benchmarked on your problem. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... 
).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def arr(self): ... + @property + def bin(self): ... 
+ @property + def cat(self): ... + @property + def dt(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/lazyframe/frame deleted file mode 100644 index a28dae7..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/lazyframe/frame +++ /dev/null @@ -1,129 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr, Series as Series -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath -from typing import Any, Callable, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: set[str] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., 
infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... - @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_plan(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... 
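The deleted stub above types ``pipe`` with ``ParamSpec``/``Concatenate`` (``Callable[Concatenate[LazyFrame, P], T]``), which is what lets a type checker match the extra ``*args``/``**kwargs`` against the piped function's own signature. A minimal sketch of that pattern, outside the stub itself; the helper and column names below are made up for illustration and are not part of polars:

from typing import Callable, TypeVar

import polars as pl
from typing_extensions import Concatenate, ParamSpec

P = ParamSpec("P")
T = TypeVar("T")


def pipe_like(
    lf: pl.LazyFrame,
    function: Callable[Concatenate[pl.LazyFrame, P], T],
    *args: P.args,
    **kwargs: P.kwargs,
) -> T:
    # Mirrors the shape of the stubbed ``LazyFrame.pipe``: the frame is bound to
    # the first parameter, the remaining arguments are checked against ``function``.
    return function(lf, *args, **kwargs)


def add_offset(lf: pl.LazyFrame, column: str, offset: int) -> pl.LazyFrame:
    # Hypothetical piped function, used only to exercise the typing.
    return lf.with_columns(pl.col(column) + offset)


out = pipe_like(pl.LazyFrame({"a": [1, 2, 3]}), add_offset, "a", offset=10)
# A type checker accepts this call and would flag, e.g., offset="ten".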
- def describe_optimized_plan(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy[Self]: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> LazyGroupBy[Self]: ... 
- def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> LazyGroupBy[Self]: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other): ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... 
- def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..901a1cb --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/lazyframe/frame.pyi @@ -0,0 +1,3317 @@ +#: version 0.17.13 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... 
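The ``_scan_*`` classmethods stubbed above are private; as their docstrings note, they are reached through the public ``pl.scan_csv``/``pl.scan_parquet``/``pl.scan_ipc``/``pl.scan_ndjson`` functions. A small usage sketch (the file and column names are hypothetical) showing that nothing is read until the lazy plan is collected:

import polars as pl

lf = pl.scan_csv("measurements.csv")   # builds a LazyFrame; the file is not read yet
result = (
    lf.filter(pl.col("value") > 0)     # planned lazily
    .groupby("sensor")
    .agg(pl.col("value").mean())
    .collect()                         # the query executes here
)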
+ @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> Self: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. 
+ common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def describe_plan(self) -> str: + ''' + Create a string representation of the unoptimized query plan. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + .. deprecated:: 0.16.10 + Use ``LazyFrame.explain`` + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).describe_plan() # doctest: +SKIP + + ''' + def describe_optimized_plan(self) -> str: + """Create a string representation of the optimized query plan.""" + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. 
+ + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. 
+ + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... 
"c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. 
+ + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy[Self]: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. 
+        maintain_order
+            Ensure that the order of the groups is consistent with the input data.
+            This is slower than a default groupby.
+            Setting this to ``True`` blocks the possibility
+            to run on the streaming engine.
+
+        Examples
+        --------
+        Group by one column and call ``agg`` to compute the grouped sum of another
+        column.
+
+        >>> lf = pl.LazyFrame(
+        ...     {
+        ...         "a": ["a", "b", "a", "b", "c"],
+        ...         "b": [1, 2, 1, 3, 3],
+        ...         "c": [5, 4, 3, 2, 1],
+        ...     }
+        ... )
+        >>> lf.groupby("a").agg(pl.col("b").sum()).collect()  # doctest: +IGNORE_RESULT
+        shape: (3, 2)
+        ┌─────┬─────┐
+        │ a   ┆ b   │
+        │ --- ┆ --- │
+        │ str ┆ i64 │
+        ╞═════╪═════╡
+        │ a   ┆ 2   │
+        │ b   ┆ 5   │
+        │ c   ┆ 3   │
+        └─────┴─────┘
+
+        Set ``maintain_order=True`` to ensure the order of the groups is consistent with
+        the input.
+
+        >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect()
+        shape: (3, 2)
+        ┌─────┬───────────┐
+        │ a   ┆ c         │
+        │ --- ┆ ---       │
+        │ str ┆ list[i64] │
+        ╞═════╪═══════════╡
+        │ a   ┆ [5, 3]    │
+        │ b   ┆ [4, 2]    │
+        │ c   ┆ [1]       │
+        └─────┴───────────┘
+
+        Group by multiple columns by passing a list of column names.
+
+        >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect()  # doctest: +SKIP
+        shape: (4, 3)
+        ┌─────┬─────┬─────┐
+        │ a   ┆ b   ┆ c   │
+        │ --- ┆ --- ┆ --- │
+        │ str ┆ i64 ┆ i64 │
+        ╞═════╪═════╪═════╡
+        │ a   ┆ 1   ┆ 5   │
+        │ b   ┆ 2   ┆ 4   │
+        │ b   ┆ 3   ┆ 2   │
+        │ c   ┆ 3   ┆ 1   │
+        └─────┴─────┴─────┘
+
+        Or use positional arguments to group by multiple columns in the same way.
+        Expressions are also accepted.
+
+        >>> lf.groupby("a", pl.col("b") // 2).agg(
+        ...     pl.col("c").mean()
+        ... ).collect()  # doctest: +SKIP
+        shape: (3, 3)
+        ┌─────┬─────┬─────┐
+        │ a   ┆ b   ┆ c   │
+        │ --- ┆ --- ┆ --- │
+        │ str ┆ i64 ┆ f64 │
+        ╞═════╪═════╪═════╡
+        │ a   ┆ 0   ┆ 4.0 │
+        │ b   ┆ 1   ┆ 3.0 │
+        │ c   ┆ 1   ┆ 1.0 │
+        └─────┴─────┴─────┘
+
+        '''
+    def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy[Self]:
+        '''
+        Create rolling groups based on a time column.
+
+        Also works for index values of type Int32 or Int64.
+
+        Different from a ``dynamic_groupby``, the windows are now determined by the
+        individual values and are not of constant intervals. For constant intervals
+        use *groupby_dynamic*.
+
+        The `period` and `offset` arguments are created either from a timedelta, or
+        by using the following string language:
+
+        - 1ns (1 nanosecond)
+        - 1us (1 microsecond)
+        - 1ms (1 millisecond)
+        - 1s (1 second)
+        - 1m (1 minute)
+        - 1h (1 hour)
+        - 1d (1 day)
+        - 1w (1 week)
+        - 1mo (1 calendar month)
+        - 1y (1 calendar year)
+        - 1i (1 index count)
+
+        Or combine them:
+        "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+
+        Suffix with `"_saturating"` to indicate that dates too large for
+        their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28)
+        instead of erroring.
+
+        In case of a groupby_rolling on an integer column, the windows are defined by:
+
+        - "1i" # length 1
+        - "10i" # length 10
+
+        Parameters
+        ----------
+        index_column
+            Column used to group based on the time window.
+            Often of type Date/Datetime.
+            This column must be sorted in ascending order. If not, the output will not
+            make sense.
+
+            In case of a rolling groupby on indices, dtype needs to be one of
+            {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if
+            performance matters use an Int64 column.
+        period
+            length of the window
+        offset
+            offset of the window. Default is -period
+        closed : {\'right\', \'left\', \'both\', \'none\'}
+            Define which sides of the temporal interval are closed (inclusive).
+ by + Also group by this column/these columns + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.groupby_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + * \'datapoint\': Start from the first encountered data point. + * \'monday\': Start the window on the monday before the first data point. + * \'tuesday\': Start the window on the tuesday before the first data point. + * ... + * \'sunday\': Start the window on the sunday before the first data point. + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_rolling + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... 
).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... 
).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... 
).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. 
+ + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. 
+ + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... 
) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/series/series deleted file mode 100644 index 7266a1b..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/series/series +++ /dev/null @@ -1,327 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, 
SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: set[str] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - @classmethod - def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... 
- def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - def __floordiv__(self, other): ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... 
- def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, maintain_order: bool = ...) -> DataFrame: ... - def qcut(self, quantiles: list[float], *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... 
- def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... 
- def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - @property - def arr(self) -> ListNameSpace: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... 
- @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/series/series.pyi new file mode 100644 index 0000000..2fa02c8 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.13/polars/series/series.pyi @@ -0,0 +1,4036 @@ +#: version 0.17.13 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: ClassVar[None] = ... + _accessors: ClassVar[set] = ... 
+ def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + @classmethod + def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... 
+ def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. 
+ + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ...) -> DataFrame: + ''' + Bin values into discrete values. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1]) + shape: (12, 3) + ┌──────┬─────────────┬──────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪══════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ + │ 1.5 ┆ inf ┆ (1.0, inf] │ + │ 2.0 ┆ inf ┆ (1.0, inf] │ + │ 2.5 ┆ inf ┆ (1.0, inf] │ + └──────┴─────────────┴──────────────┘ + + ''' + def qcut(self, quantiles: list[float]) -> DataFrame: + ''' + Bin values into discrete values based on their quantiles. + + Parameters + ---------- + quantiles + Quaniles to create. + We expect quantiles ``0.0 <= quantile <= 1`` + labels + Labels to assign to the quantiles. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75]) + shape: (8, 3) + ┌──────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪═══════════════╡ + │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ + │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ + │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1.0 ┆ inf ┆ (0.25, inf] │ + │ 2.0 ┆ inf ┆ (0.25, inf] │ + └──────┴─────────────┴───────────────┘ + + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. 
+ + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. 
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ListNameSpace.explode : Explode a list column. + StringNameSpace.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. 
+ + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). 
+ + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + ''' + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_skew(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 0.0 + 0.0 + 0.381802 + 0.0 + ] + + ''' + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) 
-> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. 
+ + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
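A quick sanity check (not part of the generated stub itself) of the ``ewm_*`` parameterisations described above: the docstrings give three equivalent ways to state the smoothing factor, alpha = 1/(1 + com), alpha = 2/(span + 1), and alpha = 1 - exp(-ln(2)/half_life). A minimal sketch verifying the first two coincide for alpha = 0.5 (series name "a" is arbitrary):

import polars as pl

s = pl.Series("a", [1.0, 2.0, 3.0])

# com=1 and span=3 both give alpha = 0.5:
#   1 / (1 + 1) == 2 / (3 + 1) == 0.5
assert s.ewm_mean(com=1).series_equal(s.ewm_mean(alpha=0.5))
assert s.ewm_mean(span=3).series_equal(s.ewm_mean(alpha=0.5))
# half_life=1 yields 1 - exp(-ln(2)), which is also 0.5 up to float rounding.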
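For context on how a stub like this gets exercised downstream, the following is a small, hypothetical sketch of the kind of dynamically registered Series namespace these type stubs are generated to describe. It uses polars' public ``register_series_namespace`` API; the ``math_ext`` name and ``centered`` method are illustrative only:

import polars as pl


@pl.api.register_series_namespace("math_ext")
class MathExt:
    def __init__(self, s: pl.Series) -> None:
        self._s = s

    def centered(self) -> pl.Series:
        # Subtract the mean; returns a plain Series, matching the return
        # types declared in the stub above.
        return self._s - self._s.mean()


print(pl.Series("a", [1.0, 2.0, 3.0]).math_ext.centered())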
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/dataframe/frame deleted file mode 100644 index 4c0dd9a..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/dataframe/frame +++ /dev/null @@ -1,280 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from io import BytesIO, IOBase -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as 
UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr -from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: set[str] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
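# A hedged usage sketch (not part of the stub diff): the `__init__` signature above exposes
# `schema` and `schema_overrides` on the public constructor; column names and values below
# are illustrative only.
import polars as pl

df = pl.DataFrame(
    {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.5]},
    schema_overrides={"foo": pl.UInt8},  # override only the inferred dtype of "foo"
)
print(df.schema)  # {'foo': UInt8, 'bar': Float64}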
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, names: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: ... - def _div(self, other: Any, floordiv: bool) -> Self: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... 
- def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
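# A hedged usage sketch (not part of the stub diff) of what the paired `write_json`
# overloads above encode for a type checker: with no target the JSON text is returned as
# `str`; with a path (the file name below is illustrative) the frame is written to disk
# and the call returns None.
import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})
as_text: str = df.write_json()   # first overload: file=None, returns str
df.write_json("frame.json")      # second overload: file given, writes the file, returns None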
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: Iterator[str] | Sequence[str] | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... 
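# A hedged sketch (not part of the stub diff) of what the ParamSpec-based `pipe` signature
# above buys: because *args/**kwargs are typed as P.args/P.kwargs of the callback, a type
# checker can validate the forwarded arguments. `add_shifted` is a made-up helper.
import polars as pl

def add_shifted(df: pl.DataFrame, column: str, periods: int = 1) -> pl.DataFrame:
    return df.with_columns(pl.col(column).shift(periods).alias(f"{column}_shifted"))

df = pl.DataFrame({"a": [1, 2, 3]})
out = df.pipe(add_shifted, "a", periods=1)  # arguments are checked against add_shifted's parameters
# df.pipe(add_shifted, periods="one")       # a checker should flag the str here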
- def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy[Self]: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> RollingGroupBy[Self]: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> DynamicGroupBy[Self]: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ...) -> Self: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> Self: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: Expr | int | float | None) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... 
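# A hedged sketch (not part of the stub diff): the `partition_by` overloads above switch
# the inferred return type on the Literal value of `as_dict`. Data below is illustrative.
import polars as pl

df = pl.DataFrame({"g": ["a", "a", "b"], "x": [1, 2, 3]})
as_list = df.partition_by("g")                # as_dict=False (default): list of DataFrames
as_dict = df.partition_by("g", as_dict=True)  # as_dict=True: dict keyed by group value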
- def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> Self: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ...) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... 
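# A hedged sketch (not part of the stub diff): the axis-based overloads above for
# max/min/sum/mean distinguish column-wise aggregation, which keeps a one-row DataFrame,
# from row-wise aggregation, which collapses to a Series.
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})
per_column = df.sum()      # axis=0 (default): DataFrame with a single row of totals
per_row = df.sum(axis=1)   # axis=1: Series of row-wise sums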
- def interpolate(self) -> Self: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs): ... - def merge_sorted(self, other: DataFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/dataframe/frame.pyi new file mode 100644 index 0000000..6e4be53 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/dataframe/frame.pyi @@ -0,0 +1,5636 @@ +#: version 0.17.14 +import P +import np as np +import pa as pa +import pd as pd +from _io import BytesIO + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr +from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as 
_process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. 
+ infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. 
+ + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. 
+ + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> Self: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. 
+ + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... 
+ + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. 
+ + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. 
+ + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. 
+ * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. 
+ https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. 
Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. 
+ compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection uri, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional generator/iterator that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
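+
+        As an illustrative sketch only (assuming ``melt`` is available in this
+        version of polars), reshaping to long format is often much cheaper than
+        a full transpose when column/value pairs are all that is needed:
+
+        >>> pl.DataFrame({"a": [1, 2], "b": [3, 4]}).melt()  # doctest: +SKIP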
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... 
} + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy[Self]: + ''' + Start a groupby operation. 
+ + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use + *groupby_dynamic* + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. 
+ + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + - \'window\': Truncate the start of the window with the \'every\' argument. + - \'datapoint\': Start from the first encountered data point. + - \'monday\': Start the window on the monday before the first data point. + - \'tuesday\': Start the window on the tuesday before the first data point. + - ... + - \'sunday\': Start the window on the sunday before the first data point. + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. 
+ + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. 
+ + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... 
"gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. 
+ + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... 
) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current DataFrame, with zero to \'n\' rows. + + Returns a DataFrame with identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> Self: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function : {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + A predefined aggregate function str or an expression. 
+ maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot( + ... values="baz", index="foo", columns="bar", aggregate_function="first" + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... ) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. 
+ fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... ) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... 
) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. 
+ + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialises all frame data as a list of rows. + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.rows() + [(1, 2), (3, 4), (5, 6)] + >>> df.rows(named=True) + [{\'a\': 1, \'b\': 2}, {\'a\': 3, \'b\': 4}, {\'a\': 5, \'b\': 6}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters in your use-case + you should export to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + An iterator of tuples (default) or dictionaries (if named) of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. 
+ + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... 
"b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> Self: + ''' + Return Pearson product-moment correlation coefficients. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + keyword arguments are passed to numpy corrcoef + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. 
+ + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/expr/expr deleted file mode 100644 index 18c4bc1..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/expr/expr +++ /dev/null @@ -1,249 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from 
polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: set[str] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: Expr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... 
- def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any], *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... 
- def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: Expr | str) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... 
- def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def arr(self) -> ExprListNameSpace: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
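The deleted stub above typed `Expr` as `Generic[P]` with `P = ParamSpec('P')` so that `Expr.pipe` could forward the user callable's parameters and return type, whereas the freshly generated `expr.pyi` that follows starts from raw stubgen output (note the bare `import P` and the plain `class Expr:`), which apparently loses that typing until post-processing patches it back in. As a point of reference only, here is a minimal sketch of the ParamSpec/Concatenate pattern the old stub relied on; it is not part of the diff, the class body is illustrative, and the old stub additionally made `Expr` itself `Generic[P]`, which this sketch omits for simplicity.

from typing import Callable, TypeVar

from typing_extensions import Concatenate, ParamSpec

P = ParamSpec("P")
T = TypeVar("T")


class Expr:
    def pipe(
        self,
        function: Callable[Concatenate["Expr", P], T],
        *args: P.args,
        **kwargs: P.kwargs,
    ) -> T:
        # The callable receives the expression as its first argument; its
        # remaining parameters (P) and return type (T) flow through unchanged,
        # so a type checker can validate *args/**kwargs at every call site.
        return function(self, *args, **kwargs)

With this annotation, `expr.pipe(lambda e, n: e * n, n=2)` type-checks and an extra or misspelled keyword is rejected; the raw stubgen signature in the new file provides no such checking until the cleanup step restores the annotation.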
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/expr/expr.pyi new file mode 100644 index 0000000..7486be3 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/expr/expr.pyi @@ -0,0 +1,6186 @@ +#: version 0.17.14 +import P +import np as np +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... 
+ def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + pl.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. 
+ + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to root column name. + + See Also + -------- + alias + map_alias + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to root column name. + + See Also + -------- + alias + map_alias + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... 
"B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + + >>> df.select(pl.all().reverse().suffix("_reverse")).with_columns( + ... pl.all().map_alias( + ... # Remove "_reverse" suffix and convert to lower case. + ... lambda col_name: col_name.rsplit("_reverse", 1)[0].lower() + ... ) + ... ) + shape: (2, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 2 ┆ 4 ┆ 2 ┆ 4 │ + │ 1 ┆ 3 ┆ 1 ┆ 3 │ + └───────────┴───────────┴─────┴─────┘ + + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... 
) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: Expr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └─────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... 
) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. 
+ + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. 
+ For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any]) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.fill_null(strategy="zero") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ 0 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(99) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ 99 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(strategy="forward") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.fill_nan("zero") + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪══════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ zero │ + │ zero ┆ 6.0 │ + └──────┴──────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 6 │ + │ null ┆ 6 │ + └──────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. 
+ + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. 
+ + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
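        A worked sketch of the interpolation options, using the same column as the doctest
        below; the fractional-position convention is an assumption made for illustration,
        but it reproduces the outputs shown:

            # Sorted column: [0, 1, 2, 3, 4, 5]; quantile 0.3 falls at fractional
            # position 0.3 * (6 - 1) = 1.5, i.e. between the values 1 and 2:
            #   'lower'    -> 1.0   value at floor(1.5)
            #   'higher'   -> 2.0   value at ceil(1.5)
            #   'midpoint' -> 1.5   (1 + 2) / 2
            #   'linear'   -> 1.5   1 + 0.5 * (2 - 1)
            import polars as pl

            df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]})
            df.select(pl.col("a").quantile(0.3, interpolation="linear"))  # 1.5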
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + See Also + -------- + map_dict + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + """ + Explode a list or utf8 Series. 
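        No doctest accompanies this method in the generated stub; a minimal sketch with
        made-up data (the column names are illustrative, assuming the usual list-explode
        semantics):

            import polars as pl

            df = pl.DataFrame({"letters": ["a", "b"], "values": [[1, 2], [3, 4, 5]]})
            # Every list element becomes its own row, so the result has 5 rows (dtype i64).
            df.select(pl.col("values").explode())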
+ + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + ExprStringNameSpace.explode : Explode a string column. + + """ + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... 
) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... 
pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of logical exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... 
schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: Expr | str) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... 
) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'linear\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... 
"b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_min(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + └──────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_max(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 8.0, 6.0, 2.0, 16.0, 10.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_mean(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.5 │ + │ 7.0 │ + │ 4.0 │ + │ 9.0 │ + │ 13.0 │ + └──────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_sum(window_size=2), + ... ] + ... 
) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 3.0 │ + │ 5.0 │ + │ 7.0 │ + │ 9.0 │ + │ 11.0 │ + └──────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_std(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 1.527525 │ + │ 2.0 │ + └──────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 
2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_var(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 2.333333 │ + │ 4.0 │ + └──────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_median(window_size=3), + ... ] + ... 
) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_quantile(quantile=0.33, window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + └──────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... 
pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + """ + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + """ + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. 
+ + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). 
If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. 
+ + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. 
+ + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. 
valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + This can actually hurt performance and can have a lot of contention. + It is advised not to use it until actually benchmarked on your problem. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... 
).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def arr(self): ... + @property + def bin(self): ... 
+ @property + def cat(self): ... + @property + def dt(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/lazyframe/frame deleted file mode 100644 index f87c615..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/lazyframe/frame +++ /dev/null @@ -1,130 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr, Series as Series -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath -from typing import Any, Callable, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: set[str] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., 
infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... - @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_plan(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... 
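# Editorial sketch (not part of the generated stub): the query-plan methods stubbed
# above (explain, describe_plan, collect, ...) are easiest to exercise through the
# public polars API. A minimal, hedged example assuming a polars ~0.17 install;
# the frame contents are illustrative only.
import polars as pl

lf = pl.LazyFrame({"a": ["x", "y", "x"], "b": [1, 2, 3]})
query = lf.groupby("a", maintain_order=True).agg(pl.all().sum())
print(query.explain())  # textual query plan, as described for explain()/describe_plan()
df = query.collect()    # run the plan and materialize a DataFrame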
- def describe_optimized_plan(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy[Self]: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> LazyGroupBy[Self]: ... 
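# Editorial sketch (not part of the generated stub): groupby_rolling, stubbed above,
# aggregates over a temporal window anchored on a sorted index column. A hedged
# example assuming polars ~0.17; column names and the "2d" period are illustrative.
from datetime import date

import polars as pl

lf = pl.LazyFrame(
    {
        "dt": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 4)],
        "value": [1, 2, 3],
    }
).sort("dt")  # the index column must be sorted
out = (
    lf.groupby_rolling(index_column="dt", period="2d")
    .agg(pl.col("value").sum().alias("value_2d"))
    .collect()
)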
- def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> LazyGroupBy[Self]: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other): ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... 
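# Editorial sketch (not part of the generated stub): the column-wise helpers stubbed
# above (with_columns, fill_null, unique, ...) compose lazily. A small example,
# assuming polars ~0.17; the data is illustrative.
import polars as pl

lf = pl.LazyFrame({"a": [1, None, 1], "b": [10, 20, 30]})
out = (
    lf.with_columns(pl.col("a").fill_null(0))  # replace nulls in "a" with 0
    .unique(subset="a", maintain_order=True)   # keep one row per value of "a"
    .collect()
)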
- def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..740aa5a --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/lazyframe/frame.pyi @@ -0,0 +1,3341 @@ +#: version 0.17.14 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... 
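# Editorial sketch (not part of the generated stub): the private _scan_* classmethods
# stubbed here are reached through the public pl.scan_* readers, as their docstrings
# note ("Use pl.scan_csv to dispatch to this method"). A hedged example assuming a
# polars ~0.17 install; "data.csv" is a placeholder path.
import polars as pl

lf = pl.scan_csv("data.csv")            # dispatches to LazyFrame._scan_csv internally
result = lf.select(pl.all()).collect()  # the query only executes on collect()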
+ @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> Self: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. 
+ common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def describe_plan(self) -> str: + ''' + Create a string representation of the unoptimized query plan. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + .. deprecated:: 0.16.10 + Use ``LazyFrame.explain`` + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).describe_plan() # doctest: +SKIP + + ''' + def describe_optimized_plan(self) -> str: + """Create a string representation of the optimized query plan.""" + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. 
+ + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. 
+ + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... 
"c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. 
+ + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy[Self]: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. 
+ maintain_order
+ Ensure that the order of the groups is consistent with the input data.
+ This is slower than a default groupby.
+ Setting this to ``True`` blocks the possibility
+ to run on the streaming engine.
+
+ Examples
+ --------
+ Group by one column and call ``agg`` to compute the grouped sum of another
+ column.
+
+ >>> lf = pl.LazyFrame(
+ ... {
+ ... "a": ["a", "b", "a", "b", "c"],
+ ... "b": [1, 2, 1, 3, 3],
+ ... "c": [5, 4, 3, 2, 1],
+ ... }
+ ... )
+ >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT
+ shape: (3, 2)
+ ┌─────┬─────┐
+ │ a ┆ b │
+ │ --- ┆ --- │
+ │ str ┆ i64 │
+ ╞═════╪═════╡
+ │ a ┆ 2 │
+ │ b ┆ 5 │
+ │ c ┆ 3 │
+ └─────┴─────┘
+
+ Set ``maintain_order=True`` to ensure the order of the groups is consistent with
+ the input.
+
+ >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect()
+ shape: (3, 2)
+ ┌─────┬───────────┐
+ │ a ┆ c │
+ │ --- ┆ --- │
+ │ str ┆ list[i64] │
+ ╞═════╪═══════════╡
+ │ a ┆ [5, 3] │
+ │ b ┆ [4, 2] │
+ │ c ┆ [1] │
+ └─────┴───────────┘
+
+ Group by multiple columns by passing a list of column names.
+
+ >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP
+ shape: (4, 3)
+ ┌─────┬─────┬─────┐
+ │ a ┆ b ┆ c │
+ │ --- ┆ --- ┆ --- │
+ │ str ┆ i64 ┆ i64 │
+ ╞═════╪═════╪═════╡
+ │ a ┆ 1 ┆ 5 │
+ │ b ┆ 2 ┆ 4 │
+ │ b ┆ 3 ┆ 2 │
+ │ c ┆ 3 ┆ 1 │
+ └─────┴─────┴─────┘
+
+ Or use positional arguments to group by multiple columns in the same way.
+ Expressions are also accepted.
+
+ >>> lf.groupby("a", pl.col("b") // 2).agg(
+ ... pl.col("c").mean()
+ ... ).collect() # doctest: +SKIP
+ shape: (3, 3)
+ ┌─────┬─────┬─────┐
+ │ a ┆ b ┆ c │
+ │ --- ┆ --- ┆ --- │
+ │ str ┆ i64 ┆ f64 │
+ ╞═════╪═════╪═════╡
+ │ a ┆ 0 ┆ 4.0 │
+ │ b ┆ 1 ┆ 3.0 │
+ │ c ┆ 1 ┆ 1.0 │
+ └─────┴─────┴─────┘
+
+ '''
+ def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy[Self]:
+ '''
+ Create rolling groups based on a time column.
+
+ Also works for index values of type Int32 or Int64.
+
+ Different from a ``dynamic_groupby``, the windows are now determined by the
+ individual values and are not of constant intervals. For constant intervals
+ use *groupby_dynamic*.
+
+ The `period` and `offset` arguments are created either from a timedelta, or
+ by using the following string language:
+
+ - 1ns (1 nanosecond)
+ - 1us (1 microsecond)
+ - 1ms (1 millisecond)
+ - 1s (1 second)
+ - 1m (1 minute)
+ - 1h (1 hour)
+ - 1d (1 day)
+ - 1w (1 week)
+ - 1mo (1 calendar month)
+ - 1y (1 calendar year)
+ - 1i (1 index count)
+
+ Or combine them:
+ "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+
+ Suffix with `"_saturating"` to indicate that dates too large for
+ their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28)
+ instead of erroring.
+
+ In case of a groupby_rolling on an integer column, the windows are defined by:
+
+ - "1i" # length 1
+ - "10i" # length 10
+
+ Parameters
+ ----------
+ index_column
+ Column used to group based on the time window.
+ Often of type Date/Datetime
+ This column must be sorted in ascending order. If not, the output will not
+ make sense.
+
+ In case of a rolling groupby on indices, dtype needs to be one of
+ {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if
+ performance matters use an Int64 column.
+ period
+ length of the window
+ offset
+ offset of the window. Default is -period
+ closed : {\'right\', \'left\', \'both\', \'none\'}
+ Define which sides of the temporal interval are closed (inclusive).
+ by
+ Also group by this column/these columns
+
+ Returns
+ -------
+ LazyGroupBy
+ Object you can call ``.agg`` on to aggregate by groups, the result
+ of which will be sorted by `index_column` (but note that if `by` columns are
+ passed, it will only be sorted within each `by` group).
+
+ See Also
+ --------
+ groupby_dynamic
+
+ Examples
+ --------
+ >>> dates = [
+ ... "2020-01-01 13:45:48",
+ ... "2020-01-01 16:42:13",
+ ... "2020-01-01 16:45:09",
+ ... "2020-01-02 18:12:48",
+ ... "2020-01-03 19:45:32",
+ ... "2020-01-08 23:16:43",
+ ... ]
+ >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns(
+ ... pl.col("dt").str.strptime(pl.Datetime).set_sorted()
+ ... )
+ >>> out = (
+ ... df.groupby_rolling(index_column="dt", period="2d")
+ ... .agg(
+ ... [
+ ... pl.sum("a").alias("sum_a"),
+ ... pl.min("a").alias("min_a"),
+ ... pl.max("a").alias("max_a"),
+ ... ]
+ ... )
+ ... .collect()
+ ... )
+ >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1]
+ >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1]
+ >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1]
+ >>> out
+ shape: (6, 4)
+ ┌─────────────────────┬───────┬───────┬───────┐
+ │ dt ┆ sum_a ┆ min_a ┆ max_a │
+ │ --- ┆ --- ┆ --- ┆ --- │
+ │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │
+ ╞═════════════════════╪═══════╪═══════╪═══════╡
+ │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
+ │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │
+ │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │
+ │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │
+ │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
+ │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
+ └─────────────────────┴───────┴───────┴───────┘
+
+ '''
+ def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy[Self]:
+ '''
+ Group based on a time value (or index value of type Int32, Int64).
+
+ Time windows are calculated and rows are assigned to windows. Different from a
+ normal groupby, a row can be a member of multiple groups. The time/index
+ window could be seen as a rolling window, with a window size determined by
+ dates/times/values instead of slots in the DataFrame.
+
+ A window is defined by:
+
+ - every: interval of the window
+ - period: length of the window
+ - offset: offset of the window
+
+ The `every`, `period` and `offset` arguments are created with
+ the following string language:
+
+ - 1ns (1 nanosecond)
+ - 1us (1 microsecond)
+ - 1ms (1 millisecond)
+ - 1s (1 second)
+ - 1m (1 minute)
+ - 1h (1 hour)
+ - 1d (1 day)
+ - 1w (1 week)
+ - 1mo (1 calendar month)
+ - 1y (1 calendar year)
+ - 1i (1 index count)
+
+ Or combine them:
+ "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+
+ Suffix with `"_saturating"` to indicate that dates too large for
+ their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28)
+ instead of erroring.
+
+ In case of a groupby_dynamic on an integer column, the windows are defined by:
+
+ - "1i" # length 1
+ - "10i" # length 10
+
+ .. warning::
+ The index column must be sorted in ascending order.
+
+ Parameters
+ ----------
+ index_column
+ Column used to group based on the time window.
+ Often of type Date/Datetime
+ This column must be sorted in ascending order. If not, the output will not
+ make sense.
+
+ In case of a dynamic groupby on indices, dtype needs to be one of
+ {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if
+ performance matters use an Int64 column.
+ every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + * \'datapoint\': Start from the first encountered data point. + * \'monday\': Start the window on the monday before the first data point. + * \'tuesday\': Start the window on the tuesday before the first data point. + * ... + * \'sunday\': Start the window on the sunday before the first data point. + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_rolling + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... 
).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... 
).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... 
).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... 
) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. 
+ + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) 
-> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/series/series deleted file mode 100644 index 7266a1b..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/series/series +++ /dev/null @@ -1,327 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string 
import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: set[str] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - @classmethod - def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... 
- def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - def __floordiv__(self, other): ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... 
- def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, maintain_order: bool = ...) -> DataFrame: ... - def qcut(self, quantiles: list[float], *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... 
- def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... 
- def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... 
- def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - @property - def arr(self) -> ListNameSpace: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/series/series.pyi new file mode 100644 index 0000000..fd3fa1f --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.14/polars/series/series.pyi @@ -0,0 +1,4036 @@ +#: version 0.17.14 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, 
_time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + @classmethod + def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... 
+ def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) 
-> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ...) -> DataFrame: + ''' + Bin values into discrete values. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1]) + shape: (12, 3) + ┌──────┬─────────────┬──────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪══════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ + │ 1.5 ┆ inf ┆ (1.0, inf] │ + │ 2.0 ┆ inf ┆ (1.0, inf] │ + │ 2.5 ┆ inf ┆ (1.0, inf] │ + └──────┴─────────────┴──────────────┘ + + ''' + def qcut(self, quantiles: list[float]) -> DataFrame: + ''' + Bin values into discrete values based on their quantiles. + + Parameters + ---------- + quantiles + Quaniles to create. + We expect quantiles ``0.0 <= quantile <= 1`` + labels + Labels to assign to the quantiles. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75]) + shape: (8, 3) + ┌──────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪═══════════════╡ + │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ + │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ + │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1.0 ┆ inf ┆ (0.25, inf] │ + │ 2.0 ┆ inf ┆ (0.25, inf] │ + └──────┴─────────────┴───────────────┘ + + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. 
+ + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. 
+ + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. 
+ + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. 
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ListNameSpace.explode : Explode a list column. + StringNameSpace.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. 
+ strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. 
+ * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. 
This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. 
+ + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. 
+ + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + ''' + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_skew(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 0.0 + 0.0 + 0.381802 + 0.0 + ] + + ''' + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) 
-> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. 
+ + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
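A minimal sketch (not part of the generated stub) of how the three decay parameterisations documented in the ewm_mean/ewm_std/ewm_var docstrings above reduce to the same smoothing factor alpha; the helper name and sample values are illustrative assumptions, not polars API:

    import math

    def alpha_from(com=None, span=None, half_life=None):
        # Per the formulas in the ewm_* docstrings above.
        if com is not None:
            return 1.0 / (1.0 + com)           # alpha = 1 / (1 + gamma)
        if span is not None:
            return 2.0 / (span + 1.0)          # alpha = 2 / (theta + 1)
        if half_life is not None:
            return 1.0 - math.exp(-math.log(2.0) / half_life)
        raise ValueError("provide exactly one decay parameter")

    # com=1, as used in the ewm_std/ewm_var examples, corresponds to alpha = 0.5
    assert alpha_from(com=1) == 0.5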
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/dataframe/frame deleted file mode 100644 index a1368ff..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/dataframe/frame +++ /dev/null @@ -1,282 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Null as Null, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, Struct as Struct, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as 
StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr -from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: set[str] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, names: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: ... - def _div(self, other: Any, floordiv: bool) -> Self: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... 
- def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
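The write_csv/write_json overloads in the stub above distinguish `file: None` (return the serialized text) from a concrete file target (return None), so a type checker can narrow the return type from the call site. A hedged usage sketch, with made-up frame contents and an illustrative output path:

    import polars as pl

    df = pl.DataFrame({"foo": [1, 2, 3]})

    # No file argument -> the overload returning `str` is selected.
    csv_text: str = df.write_csv()

    # A path argument -> the overload returning `None` is selected.
    df.write_csv("out.csv")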
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... 
- def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy[Self]: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> RollingGroupBy[Self]: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> DynamicGroupBy[Self]: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ...) -> Self: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> Self: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: Expr | int | float | None) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: ... 
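The stub above parameterises DataFrame with a ParamSpec `P` and types `pipe` as `Callable[Concatenate[DataFrame, P], T]`, which is what lets a type checker validate the extra arguments forwarded to the piped function. A small sketch of the effect; the helper and column names are invented for illustration:

    import polars as pl

    def with_total(df: pl.DataFrame, col: str) -> pl.DataFrame:
        # Toy helper: add a running total of `col`.
        return df.with_columns(pl.col(col).cumsum().alias(f"{col}_total"))

    df = pl.DataFrame({"x": [1, 2, 3]})

    # With the Concatenate/ParamSpec signature, a checker can verify that
    # `col="x"` matches with_total's parameters and that `out` is a DataFrame.
    out = df.pipe(with_total, col="x")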
- @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> Self: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ...) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) 
-> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> Self: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs): ... - def merge_sorted(self, other: DataFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/dataframe/frame.pyi new file mode 100644 index 0000000..45b3350 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/dataframe/frame.pyi @@ -0,0 +1,5713 @@ +#: version 0.17.15 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Null as Null, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, Struct as Struct, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import expr_to_lit_or_expr as 
expr_to_lit_or_expr +from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
+ schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. 
+ schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. 
+ + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> Self: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... 
) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. 
+ compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". 
+ column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. 
For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... 
column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... 
} + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection uri, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Note: Some polars data types like `Null`, `Categorical` and `Time` are + not supported by the delta protocol specification. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Examples + -------- + Instantiate a basic dataframe: + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... 
} + ... ) + + Write DataFrame as a Delta Lake table on local filesystem. + + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on local filesystem. + Note: This will fail if schema of the new data does not match the + schema of existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + Note: If the schema of the new and old data is same, + then setting `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table on cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
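+ For example, when a long layout is acceptable, ``melt`` may be a cheaper
+ alternative, since it avoids pivoting every row into a column
+ (illustrative sketch, not an exhaustive recommendation):
+
+ >>> df.melt(variable_name="column", value_name="value")  # doctest: +SKIP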
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... 
} + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy[Self]: + ''' + Start a groupby operation. 
+ + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use + *groupby_dynamic* + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. 
+ + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + - \'window\': Truncate the start of the window with the \'every\' argument. + - \'datapoint\': Start from the first encountered data point. + - \'monday\': Start the window on the monday before the first data point. + - \'tuesday\': Start the window on the tuesday before the first data point. + - ... + - \'sunday\': Start the window on the sunday before the first data point. + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. 
+ + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. 
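+ For instance, ``every="1mo_saturating"`` steps one calendar month at a
+ time while clamping otherwise-invalid dates (such as Jan 31 + 1mo) to the
+ last valid day of that month (illustrative sketch):
+
+ >>> df.upsample(time_column="time", every="1mo_saturating")  # doctest: +SKIP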
+ + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... 
"gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. 
+ + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... 
) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current DataFrame, with zero to \'n\' rows. + + Returns a DataFrame with identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> Self: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function : {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + A predefined aggregate function str or an expression. 
+ maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot( + ... values="baz", index="foo", columns="bar", aggregate_function="first" + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... ) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. 
+ fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... ) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... 
) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. 
+ + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialises all frame data as a list of rows. + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.rows() + [(1, 2), (3, 4), (5, 6)] + >>> df.rows(named=True) + [{\'a\': 1, \'b\': 2}, {\'a\': 3, \'b\': 4}, {\'a\': 5, \'b\': 6}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters in your use-case + you should export to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + An iterator of tuples (default) or dictionaries (if named) of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. 
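+
+ The slices share memory with the source frame, so chunked iteration over a
+ large frame does not duplicate the underlying data.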
+ + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... 
"b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> Self: + ''' + Return Pearson product-moment correlation coefficients. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + keyword arguments are passed to numpy corrcoef + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. 
+ + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/expr/expr deleted file mode 100644 index 18c4bc1..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/expr/expr +++ /dev/null @@ -1,249 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from 
polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: set[str] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: Expr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... 
- def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any], *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... 
- def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: Expr | str) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... 
- def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def arr(self) -> ExprListNameSpace: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/expr/expr.pyi new file mode 100644 index 0000000..922d500 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/expr/expr.pyi @@ -0,0 +1,6203 @@ +#: version 0.17.15 +import P +import np as np +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... 
+ def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + pl.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. 
+ + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to root column name. + + See Also + -------- + alias + map_alias + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to root column name. + + See Also + -------- + alias + map_alias + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... 
"B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + + >>> df.select(pl.all().reverse().suffix("_reverse")).with_columns( + ... pl.all().map_alias( + ... # Remove "_reverse" suffix and convert to lower case. + ... lambda col_name: col_name.rsplit("_reverse", 1)[0].lower() + ... ) + ... ) + shape: (2, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 2 ┆ 4 ┆ 2 ┆ 4 │ + │ 1 ┆ 3 ┆ 1 ┆ 3 │ + └───────────┴───────────┴─────┴─────┘ + + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... 
) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: Expr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... 
) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. 
+ + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. 
+ For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any]) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.fill_null(strategy="zero") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ 0 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(99) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ 99 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(strategy="forward") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.fill_nan("zero") + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪══════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ zero │ + │ zero ┆ 6.0 │ + └──────┴──────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 6 │ + │ null ┆ 6 │ + └──────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. 
+ + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. 
+ + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + See Also + -------- + map_dict + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + """ + Explode a list or utf8 Series. 
+ + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + ExprStringNameSpace.explode : Explode a string column. + + """ + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... 
) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... 
pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of logical exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... 
schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: Expr | str) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... 
) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'linear\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... 
"b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_min(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + └──────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_max(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 8.0, 6.0, 2.0, 16.0, 10.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_mean(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.5 │ + │ 7.0 │ + │ 4.0 │ + │ 9.0 │ + │ 13.0 │ + └──────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_sum(window_size=2), + ... ] + ... 
) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 3.0 │ + │ 5.0 │ + │ 7.0 │ + │ 9.0 │ + │ 11.0 │ + └──────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_std(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 1.527525 │ + │ 2.0 │ + └──────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 
2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_var(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 2.333333 │ + │ 4.0 │ + └──────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_median(window_size=3), + ... ] + ... 
) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_quantile(quantile=0.33, window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + └──────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... 
pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + """ + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + """ + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f32 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. 
+ + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). 
If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. 
+ + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. 
+ + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. 
valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + This can actually hurt performance and can have a lot of contention. + It is advised not to use it until actually benchmarked on your problem. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... 
).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def arr(self): ... + @property + def bin(self): ... 
+ @property + def cat(self): ... + @property + def dt(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/lazyframe/frame deleted file mode 100644 index 4173bfa..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/lazyframe/frame +++ /dev/null @@ -1,130 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr, Series as Series -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath -from typing import Any, Callable, Collection, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: set[str] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., 
infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... - @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_plan(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... 
- def describe_optimized_plan(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy[Self]: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> LazyGroupBy[Self]: ... 
- def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> LazyGroupBy[Self]: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other): ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... 
- def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..948088c --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/lazyframe/frame.pyi @@ -0,0 +1,3341 @@ +#: version 0.17.15 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> Self: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. 
+ projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def describe_plan(self) -> str: + ''' + Create a string representation of the unoptimized query plan. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + .. deprecated:: 0.16.10 + Use ``LazyFrame.explain`` + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).describe_plan() # doctest: +SKIP + + ''' + def describe_optimized_plan(self) -> str: + """Create a string representation of the optimized query plan.""" + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... 
) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. 
Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. 
+ streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... 
) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... 
+ shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy[Self]: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. 
+ Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.groupby_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. 
+ + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + * \'datapoint\': Start from the first encountered data point. + * \'monday\': Start the window on the monday before the first data point. + * \'tuesday\': Start the window on the tuesday before the first data point. + * ... + * \'sunday\': Start the window on the sunday before the first data point. + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_rolling + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... 
).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. 
+ tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... 
).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... 
).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... 
) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. 
+ + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) 
-> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/series/series deleted file mode 100644 index 86ea01c..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/series/series +++ /dev/null @@ -1,325 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string 
import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, find_stacklevel as find_stacklevel, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: set[str] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... 
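A minimal sketch of the ``merge_sorted`` / ``set_sorted`` usage documented a little earlier (those docstrings carry no example of their own). The frames and values below are hypothetical; both inputs are assumed to be pre-sorted on the key and to share an identical schema, as the ``merge_sorted`` docstring requires:

    >>> import polars as pl
    >>> left = pl.LazyFrame({"key": [1, 3, 5], "val": ["a", "b", "c"]}).set_sorted("key")
    >>> right = pl.LazyFrame({"key": [2, 4, 6], "val": ["x", "y", "z"]}).set_sorted("key")
    >>> left.merge_sorted(right, key="key").collect()  # 6 rows, output remains sorted by "key"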
- @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - def __floordiv__(self, other): ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... 
- def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, maintain_order: bool = ...) -> DataFrame: ... - def qcut(self, quantiles: list[float], *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool | None = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... 
- def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... 
- def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... 
- def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - @property - def arr(self) -> ListNameSpace: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/series/series.pyi new file mode 100644 index 0000000..25fd7f2 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.15/polars/series/series.pyi @@ -0,0 +1,4034 @@ +#: version 0.17.15 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, 
_time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, find_stacklevel as find_stacklevel, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... 
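A minimal sketch of the operator/method equivalence documented above (``le`` is the method form of ``series <= other``, and likewise for ``lt``, ``eq``, ``ne``, ``ge``, ``gt``); the series values are hypothetical:

    >>> import polars as pl
    >>> s = pl.Series("a", [1, 2, 3])
    >>> s.le(2).series_equal(s <= 2)  # method form and operator form yield the same boolean Series
    True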
+ def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) 
-> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmin` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ...) -> DataFrame: + ''' + Bin values into discrete values. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1]) + shape: (12, 3) + ┌──────┬─────────────┬──────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪══════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ + │ 1.5 ┆ inf ┆ (1.0, inf] │ + │ 2.0 ┆ inf ┆ (1.0, inf] │ + │ 2.5 ┆ inf ┆ (1.0, inf] │ + └──────┴─────────────┴──────────────┘ + + ''' + def qcut(self, quantiles: list[float]) -> DataFrame: + ''' + Bin values into discrete values based on their quantiles. + + Parameters + ---------- + quantiles + Quaniles to create. + We expect quantiles ``0.0 <= quantile <= 1`` + labels + Labels to assign to the quantiles. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75]) + shape: (8, 3) + ┌──────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪═══════════════╡ + │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ + │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ + │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1.0 ┆ inf ┆ (0.25, inf] │ + │ 2.0 ┆ inf ┆ (0.25, inf] │ + └──────┴─────────────┴───────────────┘ + + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. 
+ + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. 
+ + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. 
+ + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', the index of the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are NaN. 
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ListNameSpace.explode : Explode a list column. + StringNameSpace.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. 
+ strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. 
+ * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. 
This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. 
+ + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. 
+ + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + ''' + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_skew(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 0.0 + 0.0 + 0.381802 + 0.0 + ] + + ''' + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) 
-> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. 
+ + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``.
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to use fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def str(self): ... + @property + def struct(self): ...
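The ewm_mean, ewm_std and ewm_var stubs above all document the same decay parameters (com, span, half_life, alpha), which the quoted formulas reduce to a single smoothing factor. As a reading aid only (this helper is not part of the generated stubs or of the Polars API, and its name is made up for illustration), the relationships can be restated in plain Python:

import math

def resolve_ewm_alpha(com=None, span=None, half_life=None, alpha=None) -> float:
    # Restates the formulas quoted in the ewm_* docstrings above:
    #   alpha = 1 / (1 + com)               for com >= 0
    #   alpha = 2 / (span + 1)              for span >= 1
    #   alpha = 1 - exp(-ln(2) / half_life) for half_life > 0
    if alpha is not None:
        if not 0 < alpha <= 1:
            raise ValueError("alpha must satisfy 0 < alpha <= 1")
        return alpha
    if com is not None:
        return 1.0 / (1.0 + com)
    if span is not None:
        return 2.0 / (span + 1.0)
    if half_life is not None:
        return 1.0 - math.exp(-math.log(2.0) / half_life)
    raise ValueError("one of com, span, half_life or alpha must be given")

For example, resolve_ewm_alpha(com=1) gives 0.5, the smoothing factor implied by the ewm_std(com=1) and ewm_var(com=1) examples above.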
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/dataframe/frame deleted file mode 100644 index 7e1857d..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/dataframe/frame +++ /dev/null @@ -1,285 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from io import BytesIO, IOBase -from pathlib import Path -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.expr import Expr as Expr -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.lazyframe import LazyFrame as LazyFrame -from polars.polars import PyDataFrame as PyDataFrame -from polars.series import Series as Series -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils import NoDefault as NoDefault, 
no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr -from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: set[str] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, names: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: ... - def _div(self, other: Any, floordiv: bool) -> Self: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... 
- def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int]) -> Series: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, str]) -> Series: ... - @overload - def __getitem__(self, item: tuple[int, int]) -> Any: ... - @overload - def __getitem__(self, item: tuple[int, str]) -> Any: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: Iterator[str] | Sequence[str] | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... 
- def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy[Self]: ... - def groupby_rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> RollingGroupBy[Self]: ... - def groupby_dynamic(self, index_column: str, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> DynamicGroupBy[Self]: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ...) -> Self: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> Self: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: Expr | int | float | None) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... 
- def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> Self: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ...) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... 
- def interpolate(self) -> Self: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs): ... - def merge_sorted(self, other: DataFrame, key: str) -> Self: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/dataframe/frame.pyi new file mode 100644 index 0000000..b0bceca --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/dataframe/frame.pyi @@ -0,0 +1,5549 @@ +#: version 0.17.3 +import P +import np as np +import pa as pa +import pd as pd +from _io import BytesIO + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Object as Object, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr +from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, 
normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +FLOAT_DTYPES: frozenset +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) 
-> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
+ schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also known as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns.
+ + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> Self: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int] | tuple[MultiRowSelector, str] | tuple[int, int] | tuple[int, str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self) -> Any: + ''' + Return the dataframe as a scalar. + + Equivalent to ``df[0,0]``, with a check that the shape is (1,1). + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> result = df.select((pl.col("a") * pl.col("b")).sum()) + >>> result + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 32 │ + └─────┘ + >>> result.item() + 32 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... 
) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> numpy_array = df.to_numpy() + >>> type(numpy_array) + + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. 
+ kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. 
If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". 
+ table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). 
+ * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... 
"dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... 
# customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. 
+ pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection uri, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional generator/iterator that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... 
} + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy[Self]: + ''' + Start a groupby operation. 
+ + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: str) -> RollingGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use + *groupby_dynamic* + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime) + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: str) -> DynamicGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. 
+ + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + - \'window\': Truncate the start of the window with the \'every\' argument. + - \'datapoint\': Start from the first encountered data point. + - \'monday\': Start the window on the monday before the first data point. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... 
) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... 
datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ) + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ) + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ) + >>> population.join_asof( + ... gdp, left_on="date", right_on="date", strategy="backward" + ... 
) + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). 
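+
+         As a compact illustration of those points (an editorial sketch; the
+         Examples section further down expands on the same comparison), the first
+         call below routes every row through the Python interpreter while the
+         second stays in the native expression engine:
+
+         >>> frame = pl.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
+         >>> frame.apply(lambda row: row[0] + row[1])  # doctest: +IGNORE_RESULT
+         >>> frame.select(pl.col("x") + pl.col("y"))  # doctest: +IGNORE_RESULT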
+ + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select([pl.col("foo") * 2, pl.col("bar") * 3]) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. 
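+
+         A hypothetical sketch of the trade-off spelled out below (``pieces`` and
+         ``out`` are illustrative names): prefer ``vstack`` plus a final ``rechunk``
+         when appending many times, and ``extend`` for a single append that is
+         queried right away.
+
+         >>> pieces = [pl.DataFrame({"a": [i]}) for i in range(3)]
+         >>> out = pieces[0]
+         >>> for piece in pieces[1:]:
+         ...     out = out.vstack(piece)
+         >>> out = out.rechunk()  # compact chunks before heavy querying
+         >>> out = out.extend(pl.DataFrame({"a": [99]}))  # single append: extend is fine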
+ + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current DataFrame, with zero to \'n\' rows. + + Returns a DataFrame with identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... 
) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> Self: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function : {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + A predefined aggregate function str or an expression. 
+ maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot( + ... values="baz", index="foo", columns="bar", aggregate_function="first" + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Run an expression as aggregation function + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... ) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. 
+ fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... ) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... 
) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you absolutely + require row-iteration you should strongly prefer ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. 
+ + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialises all frame data as a list of rows. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.rows() + [(1, 2), (3, 4), (5, 6)] + >>> df.rows(named=True) + [{\'a\': 1, \'b\': 2}, {\'a\': 3, \'b\': 4}, {\'a\': 5, \'b\': 6}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters in your use-case + you should export to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + An iterator of tuples (default) or dictionaries (if named) of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. 
+ + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... 
) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> Self: + ''' + Return Pearson product-moment correlation coefficients. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + keyword arguments are passed to numpy corrcoef + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. 
+ \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/expr/expr deleted file mode 100644 index 69228f9..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/expr/expr +++ /dev/null @@ -1,250 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.lazyframe import LazyFrame as LazyFrame -from polars.polars import PyExpr as PyExpr -from polars.series import Series as Series -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import find_stacklevel as find_stacklevel, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, 
Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: set[str] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: Expr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... 
- def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any], *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... 
- def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: Expr | str) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... 
- def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def arr(self) -> ExprListNameSpace: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/expr/expr.pyi new file mode 100644 index 0000000..fb4b44e --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/expr/expr.pyi @@ -0,0 +1,6095 @@ +#: version 0.17.3 +import P +import np as np +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import find_stacklevel as find_stacklevel, sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +FLOAT_DTYPES: frozenset +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... 
+ def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + pl.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... [ + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ] + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. 
+ + >>> df.select([(pl.lit(10) / pl.all()).keep_name()]) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr): + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + ... + >>> df = pl.DataFrame({"a": ["a: 1", "b: 2", "c: 3"]}) + >>> df.with_columns(pl.col("a").pipe(extract_number)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + >>> df.select( + ... pl.all().reverse().map_alias(lambda colName: colName + "_reverse") + ... ) + shape: (2, 2) + ┌───────────┬───────────┐ + │ A_reverse ┆ B_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════════╪═══════════╡ + │ 2 ┆ 4 │ + │ 1 ┆ 3 │ + └───────────┴───────────┘ + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. 
+ + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... 
"b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: Expr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └─────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... 
) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... 
) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... 
) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any]) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.fill_null(strategy="zero") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ 0 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(99) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ 99 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(strategy="forward") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.fill_nan("zero") + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪══════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ zero │ + │ zero ┆ 6.0 │ + └──────┴──────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 6 │ + │ null ┆ 6 │ + └──────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. 
+ + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. 
+ + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. 
+ + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + .. 
deprecated:: 0.15.16 + `Expr.explode` will be removed in favour of `Expr.arr.explode` and + `Expr.str.explode`. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + ExprStringNameSpace.explode : Explode a string column. + + """ + def implode(self) -> Self: + ''' + Aggregate all column values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... 
) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... 
pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of logical exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... 
schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: Expr | str) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... 
) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'linear\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... 
"b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_min(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + └──────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. 
The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_max(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 8.0, 6.0, 2.0, 16.0, 10.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_mean(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.5 │ + │ 7.0 │ + │ 4.0 │ + │ 9.0 │ + │ 13.0 │ + └──────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_sum(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 3.0 │ + │ 5.0 │ + │ 7.0 │ + │ 9.0 │ + │ 11.0 │ + └──────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_std(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 1.527525 │ + │ 2.0 │ + └──────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_var(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 2.333333 │ + │ 4.0 │ + └──────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_median(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_quantile(quantile=0.33, window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + └──────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + """ + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + """ + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. 
+ The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. 
For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... 
pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. 
+ + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... 
.map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/lazyframe/frame deleted file mode 100644 index 1a8c8b5..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/lazyframe/frame +++ /dev/null @@ -1,130 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.expr.expr import Expr as Expr -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.series import Series as Series -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, PolarsExprType as PolarsExprType, PythonLiteral as PythonLiteral, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath -from typing import Any, Callable, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: set[str] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... 
- @classmethod
- def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ...
- @classmethod
- def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ...
- @classmethod
- def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ...
- @classmethod
- def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ...
- @classmethod
- def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ...
- @classmethod
- def from_json(cls, json: str) -> Self: ...
- @classmethod
- def read_json(cls, file: str | Path | IOBase) -> Self: ...
- @property
- def columns(self) -> list[str]: ...
- @property
- def dtypes(self) -> list[PolarsDataType]: ...
- @property
- def schema(self) -> SchemaDict: ...
- @property
- def width(self) -> int: ...
- def __bool__(self) -> NoReturn: ...
- def __contains__(self, key: str) -> bool: ...
- def __copy__(self) -> Self: ...
- def __deepcopy__(self, memo: None = ...) -> Self: ...
- def __getitem__(self, item: int | range | slice) -> Self: ...
- def __str__(self) -> str: ...
- def __repr__(self) -> str: ...
- def _repr_html_(self) -> str: ...
- @overload
- def write_json(self, file: None = ...) -> str: ...
- @overload
- def write_json(self, file: IOBase | str | Path) -> None: ...
- def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ...
- def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ...
- def describe_plan(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ...
- def describe_optimized_plan(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ...
- def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ...
- def inspect(self, fmt: str = ...) -> Self: ...
- def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ...
- def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ...
- def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ...
- def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ...
- def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ...
- def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ...
- def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ...
- def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ...
- def lazy(self) -> Self: ...
- def cache(self) -> Self: ...
- def clear(self, n: int = ...) -> Self: ...
- def clone(self) -> Self: ...
- def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: ...
- def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ...
- def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy[Self]: ...
- def groupby_rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> LazyGroupBy[Self]: ...
- def groupby_dynamic(self, index_column: str, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> LazyGroupBy[Self]: ...
- def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ...
- def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ...
- def with_columns(self, exprs: str | PolarsExprType | PythonLiteral | Series | Iterable[str | PolarsExprType | PythonLiteral | Series | None] | None = ..., *more_exprs: str | PolarsExprType | PythonLiteral | Series | None, **named_exprs: str | PolarsExprType | PythonLiteral | Series | None) -> Self: ...
- def with_context(self, other): ...
- def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ...
- def rename(self, mapping: dict[str, str]) -> Self: ...
- def reverse(self) -> Self: ...
- def shift(self, periods: int) -> Self: ...
- def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ...
- def slice(self, offset: int, length: int | None = ...) -> Self: ...
- def limit(self, n: int = ...) -> Self: ...
- def head(self, n: int = ...) -> Self: ...
- def tail(self, n: int = ...) -> Self: ...
- def last(self) -> Self: ...
- def first(self) -> Self: ...
- def approx_unique(self) -> Self: ...
- def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ...
- def take_every(self, n: int) -> Self: ...
- def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ...
- def fill_nan(self, value: int | float | Expr | None) -> Self: ...
- def std(self, ddof: int = ...) -> Self: ...
- def var(self, ddof: int = ...) -> Self: ...
- def max(self) -> Self: ...
- def min(self) -> Self: ...
- def sum(self) -> Self: ...
- def mean(self) -> Self: ...
- def median(self) -> Self: ...
- def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ...
- def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ...
- def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ...
- def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ...
- def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ...
- def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...)
-> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..24d1356 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/lazyframe/frame.pyi @@ -0,0 +1,3273 @@ +#: version 0.17.3 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... 
+ @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> Self: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. 
+ common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def describe_plan(self) -> str: + ''' + Create a string representation of the unoptimized query plan. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + .. deprecated:: 0.16.10 + Use ``LazyFrame.explain`` + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).describe_plan() # doctest: +SKIP + + ''' + def describe_optimized_plan(self) -> str: + """Create a string representation of the optimized query plan.""" + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. 
+ + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. 
+ + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... 
"c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. 
+ + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy[Self]: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. 
+ *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: str) -> LazyGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime) + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: str) -> LazyGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + * \'datapoint\': Start from the first encountered data point. + * \'monday\': Start the window on the monday before the first data point. + + See Also + -------- + groupby_rolling + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... 
).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... 
).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ) + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ) + >>> population.join_asof( + ... gdp, left_on="date", right_on="date", strategy="backward" + ... 
).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, exprs: str | PolarsExprType | PythonLiteral | Series | Iterable[str | PolarsExprType | PythonLiteral | Series | None] | None = ..., *more_exprs: str | PolarsExprType | PythonLiteral | Series | None, **named_exprs: str | PolarsExprType | PythonLiteral | Series | None) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. 
Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... 
).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. 
+ + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. 
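+
+        As a rough illustration (not part of the upstream docstring; the file
+        name and row count below are assumptions), the difference between
+        ``fetch`` and ``limit`` can be sketched as:
+
+        >>> lf = pl.scan_csv("data.csv")  # doctest: +SKIP
+        >>> lf.fetch(5)  # doctest: +SKIP  (reads only ~5 rows at the scan level)
+        >>> lf.limit(5).collect()  # doctest: +SKIP  (runs the full plan, then keeps 5 rows)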
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. 
+ Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. 
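+
+        A minimal usage sketch (illustrative only, not part of the upstream
+        docstring; the column name and values are assumptions — both frames
+        share the same schema and are pre-sorted on the key):
+
+        >>> left = pl.LazyFrame({"age": [18, 42, 44]}).sort("age")
+        >>> right = pl.LazyFrame({"age": [20, 21, 42]}).sort("age")
+        >>> left.merge_sorted(right, key="age").collect()  # doctest: +SKIP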
+ + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/series/series deleted file mode 100644 index 52d763e..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/series/series +++ /dev/null @@ -1,317 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import internals as pli -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.expr.expr import Expr as Expr -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import 
StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: set[str] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - @classmethod - def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - def __eq__(self, other: Any) -> Self: ... 
- def __ne__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - def __floordiv__(self, other): ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> SeriesIter: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... 
- def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, maintain_order: bool = ...) -> DataFrame: ... - def qcut(self, quantiles: list[float], *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... 
- def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | pli.Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... 
- def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - @property - def arr(self) -> ListNameSpace: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... 
- @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -class SeriesIter: - len: int - i: int - s: Series - def __init__(self, length: int, s: Series) -> None: ... - def __iter__(self) -> SeriesIter: ... - def __next__(self) -> Any: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/series/series.pyi new file mode 100644 index 0000000..d8d473a --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.3/polars/series/series.pyi @@ -0,0 +1,4013 @@ +#: version 0.17.3 +import np as np +import pa as pa +import pd as pd +import pli +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar, Collection, NoReturn, Sequence + +TYPE_CHECKING: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... 
+ @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + @classmethod + def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def le(self, other: Any) -> Self: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self: + """Method equivalent of operator expression ``series == other``.""" + def ne(self, other: Any) -> Self: + """Method equivalent of operator expression ``series != other``.""" + def ge(self, other: Any) -> Self: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> SeriesIter: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... 
+ def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self) -> Any: + ''' + Return the series as a scalar. + + Equivalent to ``s[0]``, with a check that the shape is (1,). + + Examples + -------- + >>> s = pl.Series("a", [1]) + >>> s.item() + 1 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. 
+
+ Parameters
+ ----------
+ percentiles
+ One or more percentiles to include in the summary statistics (if the
+ series has a numeric dtype). All values must be in the range `[0, 1]`.
+
+ Returns
+ -------
+ DataFrame with summary statistics of a Series.
+
+ Examples
+ --------
+ >>> series_num = pl.Series([1, 2, 3, 4, 5])
+ >>> series_num.describe()
+ shape: (9, 2)
+ ┌────────────┬──────────┐
+ │ statistic ┆ value │
+ │ --- ┆ --- │
+ │ str ┆ f64 │
+ ╞════════════╪══════════╡
+ │ count ┆ 5.0 │
+ │ null_count ┆ 0.0 │
+ │ mean ┆ 3.0 │
+ │ std ┆ 1.581139 │
+ │ min ┆ 1.0 │
+ │ max ┆ 5.0 │
+ │ median ┆ 3.0 │
+ │ 25% ┆ 2.0 │
+ │ 75% ┆ 4.0 │
+ └────────────┴──────────┘
+
+ >>> series_str = pl.Series(["a", "a", None, "b", "c"])
+ >>> series_str.describe()
+ shape: (3, 2)
+ ┌────────────┬───────┐
+ │ statistic ┆ value │
+ │ --- ┆ --- │
+ │ str ┆ i64 │
+ ╞════════════╪═══════╡
+ │ count ┆ 5 │
+ │ null_count ┆ 1 │
+ │ unique ┆ 4 │
+ └────────────┴───────┘
+
+ '''
+ def sum(self) -> int | float:
+ '''
+ Reduce this Series to the sum value.
+
+ Notes
+ -----
+ Dtypes in {Int8, UInt8, Int16, UInt16} are cast to
+ Int64 before summing to prevent overflow issues.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.sum()
+ 6
+
+ '''
+ def mean(self) -> int | float | None:
+ '''
+ Reduce this Series to the mean value.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.mean()
+ 2.0
+
+ '''
+ def product(self) -> int | float:
+ """Reduce this Series to the product value."""
+ def pow(self, exponent: int | float | Series) -> Series:
+ '''
+ Raise to the power of the given exponent.
+
+ Parameters
+ ----------
+ exponent
+ The exponent. Accepts Series input.
+
+ Examples
+ --------
+ >>> s = pl.Series("foo", [1, 2, 3, 4])
+ >>> s.pow(3)
+ shape: (4,)
+ Series: \'foo\' [f64]
+ [
+ 1.0
+ 8.0
+ 27.0
+ 64.0
+ ]
+
+ '''
+ def min(self) -> PythonLiteral | None:
+ '''
+ Get the minimal value in this Series.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.min()
+ 1
+
+ '''
+ def max(self) -> PythonLiteral | None:
+ '''
+ Get the maximum value in this Series.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.max()
+ 3
+
+ '''
+ def nan_max(self) -> int | float | date | datetime | timedelta | str:
+ """
+ Get maximum value, but propagate/poison encountered NaN values.
+
+ This differs from numpy's `nanmax` as numpy defaults to propagating NaN values,
+ whereas polars defaults to ignoring them.
+
+ """
+ def nan_min(self) -> int | float | date | datetime | timedelta | str:
+ """
+ Get minimum value, but propagate/poison encountered NaN values.
+
+ This differs from numpy's `nanmin` as numpy defaults to propagating NaN values,
+ whereas polars defaults to ignoring them.
+
+ """
+ def std(self, ddof: int = ...) -> float | None:
+ '''
+ Get the standard deviation of this Series.
+
+ Parameters
+ ----------
+ ddof
+ “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
+ where N represents the number of elements.
+ By default ddof is 1.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.std()
+ 1.0
+
+ '''
+ def var(self, ddof: int = ...) -> float | None:
+ '''
+ Get variance of this Series.
+
+ Parameters
+ ----------
+ ddof
+ “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
+ where N represents the number of elements.
+ By default ddof is 1.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.var()
+ 1.0
+
+ '''
+ def median(self) -> float | None:
+ '''
+ Get the median of this Series.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.median()
+ 2.0
+
+ '''
+ def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None:
+ '''
+ Get the quantile value of this Series.
+
+ Parameters
+ ----------
+ quantile
+ Quantile between 0.0 and 1.0.
+ interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'}
+ Interpolation method.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.quantile(0.5)
+ 2.0
+
+ '''
+ def to_dummies(self, separator: str = ...) -> DataFrame:
+ '''
+ Get dummy/indicator variables.
+
+ Parameters
+ ----------
+ separator
+ Separator/delimiter used when generating column names.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.to_dummies()
+ shape: (3, 3)
+ ┌─────┬─────┬─────┐
+ │ a_1 ┆ a_2 ┆ a_3 │
+ │ --- ┆ --- ┆ --- │
+ │ u8 ┆ u8 ┆ u8 │
+ ╞═════╪═════╪═════╡
+ │ 1 ┆ 0 ┆ 0 │
+ │ 0 ┆ 1 ┆ 0 │
+ │ 0 ┆ 0 ┆ 1 │
+ └─────┴─────┴─────┘
+
+ '''
+ def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ...) -> DataFrame:
+ '''
+ Bin values into discrete values.
+
+ Parameters
+ ----------
+ bins
+ Bins to create.
+ labels
+ Labels to assign to the bins. If given the length of labels must be
+ len(bins) + 1.
+ break_point_label
+ Name given to the breakpoint column.
+ category_label
+ Name given to the category column.
+ maintain_order
+ Keep the order of the original `Series`.
+
+ Returns
+ -------
+ DataFrame
+
+ Examples
+ --------
+ >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)])
+ >>> a.cut([-1, 1])
+ shape: (12, 3)
+ ┌──────┬─────────────┬──────────────┐
+ │ a ┆ break_point ┆ category │
+ │ --- ┆ --- ┆ --- │
+ │ f64 ┆ f64 ┆ cat │
+ ╞══════╪═════════════╪══════════════╡
+ │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │
+ │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │
+ │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │
+ │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │
+ │ … ┆ … ┆ … │
+ │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │
+ │ 1.5 ┆ inf ┆ (1.0, inf] │
+ │ 2.0 ┆ inf ┆ (1.0, inf] │
+ │ 2.5 ┆ inf ┆ (1.0, inf] │
+ └──────┴─────────────┴──────────────┘
+
+ '''
+ def qcut(self, quantiles: list[float]) -> DataFrame:
+ '''
+ Bin values into discrete values based on their quantiles.
+
+ Parameters
+ ----------
+ quantiles
+ Quantiles to create.
+ All quantiles must be in the range ``0.0 <= quantile <= 1.0``.
+ labels
+ Labels to assign to the quantiles. If given the length of labels must be
+ len(bins) + 1.
+ break_point_label
+ Name given to the breakpoint column.
+ category_label
+ Name given to the category column.
+ maintain_order
+ Keep the order of the original `Series`.
+
+ Returns
+ -------
+ DataFrame
+
+ Warnings
+ --------
+ This functionality is experimental and may change without it being considered a
+ breaking change.
+ + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75]) + shape: (8, 3) + ┌──────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪═══════════════╡ + │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ + │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ + │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1.0 ┆ inf ┆ (0.25, inf] │ + │ 2.0 ┆ inf ┆ (0.25, inf] │ + └──────┴─────────────┴───────────────┘ + + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. 
+ + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. 
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3, 4])
+ >>> s.take_every(2)
+ shape: (2,)
+ Series: \'a\' [i64]
+ [
+ 1
+ 3
+ ]
+
+ '''
+ def sort(self) -> Self:
+ '''
+ Sort this Series.
+
+ Parameters
+ ----------
+ descending
+ Sort in descending order.
+ in_place
+ Sort in-place.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 3, 4, 2])
+ >>> s.sort()
+ shape: (4,)
+ Series: \'a\' [i64]
+ [
+ 1
+ 2
+ 3
+ 4
+ ]
+ >>> s.sort(descending=True)
+ shape: (4,)
+ Series: \'a\' [i64]
+ [
+ 4
+ 3
+ 2
+ 1
+ ]
+
+ '''
+ def top_k(self, k: int = ...) -> Series:
+ '''
+ Return the `k` largest elements.
+
+ This has time complexity:
+
+ .. math:: O(n + k \\log{}n - \\frac{k}{2})
+
+ Parameters
+ ----------
+ k
+ Number of elements to return.
+
+ See Also
+ --------
+ bottom_k
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [2, 5, 1, 4, 3])
+ >>> s.top_k(3)
+ shape: (3,)
+ Series: \'a\' [i64]
+ [
+ 5
+ 4
+ 3
+ ]
+
+ '''
+ def bottom_k(self, k: int = ...) -> Series:
+ '''
+ Return the `k` smallest elements.
+
+ This has time complexity:
+
+ .. math:: O(n + k \\log{}n - \\frac{k}{2})
+
+ Parameters
+ ----------
+ k
+ Number of elements to return.
+
+ See Also
+ --------
+ top_k
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [2, 5, 1, 4, 3])
+ >>> s.bottom_k(3)
+ shape: (3,)
+ Series: \'a\' [i64]
+ [
+ 1
+ 2
+ 3
+ ]
+
+ '''
+ def arg_sort(self) -> Series:
+ '''
+ Get the index values that would sort this Series.
+
+ Parameters
+ ----------
+ descending
+ Sort in descending order.
+ nulls_last
+ Place null values last instead of first.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [5, 3, 4, 1, 2])
+ >>> s.arg_sort()
+ shape: (5,)
+ Series: \'a\' [u32]
+ [
+ 3
+ 4
+ 1
+ 2
+ 0
+ ]
+
+ '''
+ def arg_unique(self) -> Series:
+ '''
+ Get unique index as Series.
+
+ Returns
+ -------
+ Series
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 2, 3])
+ >>> s.arg_unique()
+ shape: (3,)
+ Series: \'a\' [u32]
+ [
+ 0
+ 1
+ 3
+ ]
+
+ '''
+ def arg_min(self) -> int | None:
+ '''
+ Get the index of the minimal value.
+
+ Returns
+ -------
+ Integer
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [3, 2, 1])
+ >>> s.arg_min()
+ 2
+
+ '''
+ def arg_max(self) -> int | None:
+ '''
+ Get the index of the maximal value.
+
+ Returns
+ -------
+ Integer
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [3, 2, 1])
+ >>> s.arg_max()
+ 0
+
+ '''
+ def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series:
+ """
+ Find indices where elements should be inserted to maintain order.
+
+ .. math:: a[i-1] < v <= a[i]
+
+ Parameters
+ ----------
+ element
+ Expression or scalar value.
+ side : {'any', 'left', 'right'}
+ If 'any', the index of the first suitable location found is given.
+ If 'left', the index of the leftmost suitable location found is given.
+ If 'right', the index of the rightmost suitable location found is given.
+
+ """
+ def unique(self) -> Series:
+ '''
+ Get unique elements in series.
+
+ Parameters
+ ----------
+ maintain_order
+ Maintain order of data. This requires more work.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 2, 3])
+ >>> s.unique().sort()
+ shape: (3,)
+ Series: \'a\' [i64]
+ [
+ 1
+ 2
+ 3
+ ]
+
+ '''
+ def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series:
+ '''
+ Take values by index.
+
+ Parameters
+ ----------
+ indices
+ Index location used for selection.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3, 4])
+ >>> s.take([1, 3])
+ shape: (2,)
+ Series: \'a\' [i64]
+ [
+ 2
+ 4
+ ]
+
+ '''
+ def null_count(self) -> int:
+ """Count the null values in this Series."""
+ def has_validity(self) -> bool:
+ """
+ Return True if the Series has a validity bitmask.
+
+ If there is none, it means that there are no null values.
+ Use this to swiftly assert a Series does not have null values.
+
+ """
+ def is_empty(self) -> bool:
+ '''
+ Check if the Series is empty.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [], dtype=pl.Float32)
+ >>> s.is_empty()
+ True
+
+ '''
+ def is_sorted(self) -> bool:
+ """
+ Check if the Series is sorted.
+
+ Parameters
+ ----------
+ descending
+ Check if the Series is sorted in descending order
+
+ """
+ def is_null(self) -> Series:
+ '''
+ Returns a boolean Series indicating which values are null.
+
+ Returns
+ -------
+ Boolean Series
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1.0, 2.0, 3.0, None])
+ >>> s.is_null()
+ shape: (4,)
+ Series: \'a\' [bool]
+ [
+ false
+ false
+ false
+ true
+ ]
+
+ '''
+ def is_not_null(self) -> Series:
+ '''
+ Returns a boolean Series indicating which values are not null.
+
+ Returns
+ -------
+ Boolean Series
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1.0, 2.0, 3.0, None])
+ >>> s.is_not_null()
+ shape: (4,)
+ Series: \'a\' [bool]
+ [
+ true
+ true
+ true
+ false
+ ]
+
+ '''
+ def is_finite(self) -> Series:
+ '''
+ Returns a boolean Series indicating which values are finite.
+
+ Returns
+ -------
+ Boolean Series
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> s = pl.Series("a", [1.0, 2.0, np.inf])
+ >>> s.is_finite()
+ shape: (3,)
+ Series: \'a\' [bool]
+ [
+ true
+ true
+ false
+ ]
+
+ '''
+ def is_infinite(self) -> Series:
+ '''
+ Returns a boolean Series indicating which values are infinite.
+
+ Returns
+ -------
+ Boolean Series
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> s = pl.Series("a", [1.0, 2.0, np.inf])
+ >>> s.is_infinite()
+ shape: (3,)
+ Series: \'a\' [bool]
+ [
+ false
+ false
+ true
+ ]
+
+ '''
+ def is_nan(self) -> Series:
+ '''
+ Returns a boolean Series indicating which values are NaN.
+
+ Returns
+ -------
+ Boolean Series
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN])
+ >>> s.is_nan()
+ shape: (4,)
+ Series: \'a\' [bool]
+ [
+ false
+ false
+ false
+ true
+ ]
+
+ '''
+ def is_not_nan(self) -> Series:
+ '''
+ Returns a boolean Series indicating which values are not NaN.
+
+ Returns
+ -------
+ Boolean Series
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN])
+ >>> s.is_not_nan()
+ shape: (4,)
+ Series: \'a\' [bool]
+ [
+ true
+ true
+ true
+ false
+ ]
+
+ '''
+ def is_in(self, other: Series | Collection[Any]) -> Series:
+ '''
+ Check if elements of this Series are in the other Series.
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + .. deprecated:: 0.15.16 + `Series.explode` will be removed in favour of `Series.arr.explode` and + `Series.str.explode`. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ListNameSpace.explode : Explode a list column. + StringNameSpace.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. 
+ + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use pyarrow for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. 
+ + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. 
+ + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. 
+ - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | pli.Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + ''' + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_skew(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 0.0 + 0.0 + 0.381802 + 0.0 + ] + + ''' + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). 
+ + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. 
+ + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. 
+ + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def str(self): ... + @property + def struct(self): ... + +class SeriesIter: + def __init__(self, length: int, s: Series) -> None: ... + def __iter__(self) -> SeriesIter: ... + def __next__(self) -> Any: ... 
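The Series methods documented in the generated stub above (ewm_std, extend_constant, set_sorted, and friends) can be exercised directly against an installed polars; the following is a minimal illustrative sketch, not part of the generated stub itself. It assumes a polars 0.17.x install, the variable name `s` is hypothetical, and the expected results in the comments are taken from the docstring examples embedded above.

    import polars as pl

    # Small integer Series matching the docstring examples above
    s = pl.Series("a", [1, 2, 3])

    s.ewm_std(com=1)            # -> [0.0, 0.707107, 0.963624], per the ewm_std example above
    s.extend_constant(99, n=2)  # -> [1, 2, 3, 99, 99], per the extend_constant example above
    s.set_sorted().max()        # -> 3; flags the Series as sorted so max() can take a fast path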
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/dataframe/frame deleted file mode 100644 index 7e1857d..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/dataframe/frame +++ /dev/null @@ -1,285 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from io import BytesIO, IOBase -from pathlib import Path -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.expr import Expr as Expr -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.lazyframe import LazyFrame as LazyFrame -from polars.polars import PyDataFrame as PyDataFrame -from polars.series import Series as Series -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils import NoDefault as NoDefault, 
no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr -from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: set[str] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, names: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: ... - def _div(self, other: Any, floordiv: bool) -> Self: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... 
- def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int]) -> Series: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, str]) -> Series: ... - @overload - def __getitem__(self, item: tuple[int, int]) -> Any: ... - @overload - def __getitem__(self, item: tuple[int, str]) -> Any: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: Iterator[str] | Sequence[str] | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... 
- def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy[Self]: ... - def groupby_rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> RollingGroupBy[Self]: ... - def groupby_dynamic(self, index_column: str, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> DynamicGroupBy[Self]: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ...) -> Self: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> Self: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: Expr | int | float | None) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... 
- def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> Self: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ...) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... 
- def interpolate(self) -> Self: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs): ... - def merge_sorted(self, other: DataFrame, key: str) -> Self: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/dataframe/frame.pyi new file mode 100644 index 0000000..d14e332 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/dataframe/frame.pyi @@ -0,0 +1,5549 @@ +#: version 0.17.5 +import P +import np as np +import pa as pa +import pd as pd +from _io import BytesIO + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Object as Object, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr +from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, 
normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +FLOAT_DTYPES: frozenset +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) 
-> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
+ schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. 
+ + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> Self: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int] | tuple[MultiRowSelector, str] | tuple[int, int] | tuple[int, str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self) -> Any: + ''' + Return the dataframe as a scalar. + + Equivalent to ``df[0,0]``, with a check that the shape is (1,1). + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> result = df.select((pl.col("a") * pl.col("b")).sum()) + >>> result + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 32 │ + └─────┘ + >>> result.item() + 32 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... 
) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> numpy_array = df.to_numpy() + >>> type(numpy_array) + + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. 
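[editor's note] A compact sketch of the export helpers documented above (``to_dict``, ``to_dicts``, ``to_numpy``); it only uses the signatures shown in these docstrings.

import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0]})

cols = df.to_dict(as_series=False)   # {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0]}
rows = df.to_dicts()                 # [{"foo": 1, "bar": 6.0}, ...]
arr = df.to_numpy()                  # 2D ndarray of shape (3, 2); clones data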
+ kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. 
If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". 
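[editor's note] The writers documented here share one convention worth calling out: passing ``file=None`` (the default) returns the serialized output instead of writing to disk. A short sketch, with an illustrative output path:

import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3], "bar": ["a", "b", "c"]})

csv_text = df.write_csv()          # file=None -> CSV returned as a str
ndjson_text = df.write_ndjson()    # one JSON object per line, as a str
df.write_csv("data.csv")           # or write straight to a path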
+ table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). 
+ * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... 
"dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... 
# customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. 
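[editor's note] As a follow-up to ``write_ipc`` above: with ``file=None`` the stream comes back as a ``BytesIO`` that can be read straight back in. A minimal round-trip sketch; ``pl.read_ipc`` is the matching reader from the polars API, not part of this stub.

import polars as pl

df = pl.DataFrame({"q1": [1, 2, 3], "q2": [4, 5, 6]})

buf = df.write_ipc(None, compression="zstd")  # file=None -> BytesIO
buf.seek(0)                                   # rewind before re-reading
roundtrip = pl.read_ipc(buf)
assert df.frame_equal(roundtrip)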
+ pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection uri, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional generator/iterator that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
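[editor's note] Tying the parquet writer and ``estimated_size`` together, a small sketch. The ``compression``/``compression_level`` keywords are the ones documented in the ``write_parquet`` docstring above (the generated signature collapses them); the output path is illustrative.

import polars as pl

df = pl.DataFrame({"x": list(range(1_000)), "y": [str(v) for v in range(1_000)]})

df.write_parquet("data.parquet", compression="zstd", compression_level=10)
print(df.estimated_size("kb"))  # heap-size estimate, scaled to kilobytes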
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. 
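[editor's note] A quick sketch combining the sorting helpers documented above (``sort``, ``top_k``) on a toy frame:

import polars as pl

df = pl.DataFrame({"a": ["x", "y", "z", "w"], "b": [3, 1, 2, 3]})

by_b_desc = df.sort("b", descending=True)  # plain single-column sort
top2 = df.top_k(2, by=["b", "a"])          # 2 rows with largest b, ties broken by a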
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... 
} + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy[Self]: + ''' + Start a groupby operation. 
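[editor's note] A short sketch chaining the frame utilities covered above (``drop_nulls``, ``pipe``, ``with_row_count``); the lambda passed to ``pipe`` is only illustrative.

import polars as pl

df = pl.DataFrame({"a": [1, None, 3], "b": ["x", "y", None]})

cleaned = (
    df.drop_nulls(subset="a")                       # keep rows where "a" is non-null
    .pipe(lambda d: d.select(sorted(d.columns)))    # arbitrary UDF step
    .with_row_count("idx")                          # prepend a u32 row counter
)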
+ + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: str) -> RollingGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use + *groupby_dynamic* + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime) + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: str) -> DynamicGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. 
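[editor's note] Mirroring the ``groupby_rolling`` docstring above, a condensed sketch (the index column must be sorted ascending, as noted there; the parsing call matches the docstring's own example):

import polars as pl

dates = ["2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-02 18:12:48"]
df = pl.DataFrame({"dt": dates, "a": [3, 7, 9]}).with_columns(
    pl.col("dt").str.strptime(pl.Datetime)
)

out = df.groupby_rolling(index_column="dt", period="2d").agg(
    pl.sum("a").alias("sum_a")
)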
+ + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + - \'window\': Truncate the start of the window with the \'every\' argument. + - \'datapoint\': Start from the first encountered data point. + - \'monday\': Start the window on the monday before the first data point. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... 
) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... 
datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ) + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ) + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ) + >>> population.join_asof( + ... gdp, left_on="date", right_on="date", strategy="backward" + ... 
) + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). 
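# Hedged illustration of the bullet points above: the same per-row arithmetic written once as
# a Python UDF and once as a native expression (the expression form is what the paragraph
# below recommends; the exact speed-up depends on the data).
import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]})
df.apply(lambda row: row[0] + row[1])                      # opaque UDF, evaluated row by row in Python
df.select((pl.col("foo") + pl.col("bar")).alias("apply"))  # equivalent expression, optimised and parallelisable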
+ + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select([pl.col("foo") * 2, pl.col("bar") * 3]) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. 
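# Hedged sketch of the guidance that follows: `extend` suits append-then-query loops, while
# repeated `vstack` calls finished with a single `rechunk` suit building a frame from many
# batches before querying. The frames below are illustrative stand-ins.
import polars as pl

live = pl.DataFrame({"x": [1, 2]})
live.extend(pl.DataFrame({"x": [3]}))          # append into the existing allocation where possible

batches = [pl.DataFrame({"x": [i]}) for i in range(3)]
combined = batches[0]
for batch in batches[1:]:
    combined = combined.vstack(batch)          # cheap: just collects chunks
combined = combined.rechunk()                  # make the data contiguous once, at the end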
+ + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current DataFrame, with zero to \'n\' rows. + + Returns a DataFrame with identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... 
) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> Self: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function : {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + A predefined aggregate function str or an expression. 
+ maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot( + ... values="baz", index="foo", columns="bar", aggregate_function="first" + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Run an expression as aggregation function + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... ) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. 
+ fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... ) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... 
) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you absolutely + require row-iteration you should strongly prefer ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. 
+ + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialises all frame data as a list of rows. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.rows() + [(1, 2), (3, 4), (5, 6)] + >>> df.rows(named=True) + [{\'a\': 1, \'b\': 2}, {\'a\': 3, \'b\': 4}, {\'a\': 5, \'b\': 6}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters in your use-case + you should export to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + An iterator of tuples (default) or dictionaries (if named) of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. 
+ + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... 
) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> Self: + ''' + Return Pearson product-moment correlation coefficients. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + keyword arguments are passed to numpy corrcoef + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. 
+ \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/expr/expr deleted file mode 100644 index 69228f9..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/expr/expr +++ /dev/null @@ -1,250 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.lazyframe import LazyFrame as LazyFrame -from polars.polars import PyExpr as PyExpr -from polars.series import Series as Series -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import find_stacklevel as find_stacklevel, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, 
Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: set[str] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: Expr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... 
- def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any], *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... 
- def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: Expr | str) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... 
- def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def arr(self) -> ExprListNameSpace: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/expr/expr.pyi new file mode 100644 index 0000000..5b3bd87 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/expr/expr.pyi @@ -0,0 +1,6095 @@ +#: version 0.17.5 +import P +import np as np +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import find_stacklevel as find_stacklevel, sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +FLOAT_DTYPES: frozenset +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... 
+ def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + pl.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... [ + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ] + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. 
+ + >>> df.select([(pl.lit(10) / pl.all()).keep_name()]) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr): + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + ... + >>> df = pl.DataFrame({"a": ["a: 1", "b: 2", "c: 3"]}) + >>> df.with_columns(pl.col("a").pipe(extract_number)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + >>> df.select( + ... pl.all().reverse().map_alias(lambda colName: colName + "_reverse") + ... ) + shape: (2, 2) + ┌───────────┬───────────┐ + │ A_reverse ┆ B_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════════╪═══════════╡ + │ 2 ┆ 4 │ + │ 1 ┆ 3 │ + └───────────┴───────────┘ + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. 
+ + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... 
"b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: Expr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └─────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... 
) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... 
) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... 
) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any]) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.fill_null(strategy="zero") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ 0 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(99) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ 99 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(strategy="forward") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.fill_nan("zero") + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪══════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ zero │ + │ zero ┆ 6.0 │ + └──────┴──────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 6 │ + │ null ┆ 6 │ + └──────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. 
+ + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. 
+ + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. 
+ + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + .. 
deprecated:: 0.15.16 + `Expr.explode` will be removed in favour of `Expr.arr.explode` and + `Expr.str.explode`. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + ExprStringNameSpace.explode : Explode a string column. + + """ + def implode(self) -> Self: + ''' + Aggregate all column values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... 
) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... 
pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of logical exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... 
schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: Expr | str) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... 
) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'linear\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... 
"b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_min(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + └──────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. 
The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_max(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 8.0, 6.0, 2.0, 16.0, 10.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_mean(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.5 │ + │ 7.0 │ + │ 4.0 │ + │ 9.0 │ + │ 13.0 │ + └──────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_sum(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 3.0 │ + │ 5.0 │ + │ 7.0 │ + │ 9.0 │ + │ 11.0 │ + └──────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_std(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 1.527525 │ + │ 2.0 │ + └──────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_var(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 2.333333 │ + │ 4.0 │ + └──────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_median(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_quantile(quantile=0.33, window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + └──────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + """ + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + """ + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. 
+ The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. 
For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... 
pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. 
+ + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... 
.map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
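The rolling_* docstrings in the expr stub above describe a dynamic temporal window language ("1d", "2h", "1i", ...) together with the `by` and `closed` arguments, but every Examples section shows only a fixed integer window. A minimal sketch of the temporal form, assuming made-up column names `ts` and `value` and the keyword names given in those docstrings (the stub signatures above elide them behind `...`); exact output depends on the polars version pinned for the stub:

# Hypothetical illustration of the temporal rolling windows described in the
# generated docstrings; column names are invented.
from datetime import datetime

import polars as pl

df = pl.DataFrame(
    {
        "ts": [datetime(2023, 1, d) for d in range(1, 6)],
        "value": [1.0, 3.0, 2.0, 5.0, 4.0],
    }
)

# With a temporal window size, `by` must point at a Date/Datetime column and
# `closed` picks which sides of each interval are inclusive.
out = df.with_columns(
    pl.col("value")
    .rolling_min(window_size="2d", by="ts", closed="right")
    .alias("value_rolling_min_2d")
)
print(out)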
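Of the rolling aggregations above, only `rolling_skew` has no Examples section in its docstring; a tiny usage sketch with invented values (output omitted, since the numbers depend on the pinned polars version):

import polars as pl

df = pl.DataFrame({"a": [1.0, 4.0, 2.0, 9.0, 3.0]})
# The first rows come back null until the three-value window is full.
print(df.select(pl.col("a").rolling_skew(window_size=3)))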
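Finally, the ewm_mean / ewm_std / ewm_var docstrings all quote the same decay formulas, and the module ends with a stub for the `_prepare_alpha` helper that normalises them to a single smoothing factor. A rough sketch of that normalisation, stated only in terms of the formulas quoted in those docstrings (not the actual polars implementation):

from __future__ import annotations

import math


def prepare_alpha_sketch(
    com: float | None = None,
    span: float | None = None,
    half_life: float | None = None,
    alpha: float | None = None,
) -> float:
    # Exactly one decay specification may be given, mirroring the docstrings.
    given = [v for v in (com, span, half_life, alpha) if v is not None]
    if len(given) != 1:
        raise ValueError("pass exactly one of com, span, half_life, alpha")
    if com is not None:        # alpha = 1 / (1 + com), for com >= 0
        return 1.0 / (1.0 + com)
    if span is not None:       # alpha = 2 / (span + 1), for span >= 1
        return 2.0 / (span + 1.0)
    if half_life is not None:  # alpha = 1 - exp(-ln(2) / half_life)
        return 1.0 - math.exp(-math.log(2.0) / half_life)
    return alpha               # alpha used directly, 0 < alpha <= 1


# com=1 corresponds to alpha=0.5, the decay used in the ewm_mean(com=1) example above.
assert math.isclose(prepare_alpha_sketch(com=1), 0.5)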
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/lazyframe/frame deleted file mode 100644 index 1a8c8b5..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/lazyframe/frame +++ /dev/null @@ -1,130 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.expr.expr import Expr as Expr -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.series import Series as Series -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, PolarsExprType as PolarsExprType, PythonLiteral as PythonLiteral, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath -from typing import Any, Callable, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: set[str] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... 
- @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_plan(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_optimized_plan(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... 
- def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy[Self]: ... - def groupby_rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> LazyGroupBy[Self]: ... 
- def groupby_dynamic(self, index_column: str, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> LazyGroupBy[Self]: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, exprs: str | PolarsExprType | PythonLiteral | Series | Iterable[str | PolarsExprType | PythonLiteral | Series | None] | None = ..., *more_exprs: str | PolarsExprType | PythonLiteral | Series | None, **named_exprs: str | PolarsExprType | PythonLiteral | Series | None) -> Self: ... - def with_context(self, other): ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) 
-> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..119ff42 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/lazyframe/frame.pyi @@ -0,0 +1,3273 @@ +#: version 0.17.5 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... 
+ @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> Self: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. 
+ common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def describe_plan(self) -> str: + ''' + Create a string representation of the unoptimized query plan. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + .. deprecated:: 0.16.10 + Use ``LazyFrame.explain`` + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).describe_plan() # doctest: +SKIP + + ''' + def describe_optimized_plan(self) -> str: + """Create a string representation of the optimized query plan.""" + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. 
+ + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. 
+ + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... 
"c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. 
+ + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy[Self]: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. 
+ *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: str) -> LazyGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime) + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: str) -> LazyGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + * \'datapoint\': Start from the first encountered data point. + * \'monday\': Start the window on the monday before the first data point. + + See Also + -------- + groupby_rolling + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... 
).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... 
).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ) + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ) + >>> population.join_asof( + ... gdp, left_on="date", right_on="date", strategy="backward" + ... 
).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, exprs: str | PolarsExprType | PythonLiteral | Series | Iterable[str | PolarsExprType | PythonLiteral | Series | None] | None = ..., *more_exprs: str | PolarsExprType | PythonLiteral | Series | None, **named_exprs: str | PolarsExprType | PythonLiteral | Series | None) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. 
Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... 
).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config() as cfg: + ... cfg.set_auto_structify(True) # doctest: +IGNORE_RESULT + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. 
+ + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. 
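[Editor's note, not part of the generated stub: a minimal sketch of the `fetch` vs. `head`/`limit` distinction described in the note above. The frame contents and variable names are hypothetical.]
>>> lf = pl.LazyFrame({"a": list(range(100)), "b": list(range(100))})
>>> sample_df = lf.fetch(5)         # materializes only the first rows at the scan level
>>> head_df = lf.head(5).collect()  # runs the full plan; the limit is applied at the end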
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. 
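[Editor's note, not part of the generated stub: an illustrative sketch combining the `name` and `offset` parameters documented above; column names and values are hypothetical.]
>>> lf = pl.LazyFrame({"a": [1, 3, 5]})
>>> numbered = lf.with_row_count(name="idx", offset=1)  # adds an "idx" column whose count starts at 1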
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. 
+ Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. 
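[Editor's note, not part of the generated stub: `merge_sorted` has no doctest in this docstring, so here is a hedged usage sketch. Both frames share the same schema and are already sorted by the hypothetical key column "t".]
>>> left = pl.LazyFrame({"t": [1, 3, 5], "v": ["a", "b", "c"]})
>>> right = pl.LazyFrame({"t": [2, 4, 6], "v": ["x", "y", "z"]})
>>> merged = left.merge_sorted(right, key="t").collect()  # rows interleaved, output still sorted by "t"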
+ + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/series/series deleted file mode 100644 index 52d763e..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/series/series +++ /dev/null @@ -1,317 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import internals as pli -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.expr.expr import Expr as Expr -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import 
StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: set[str] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - @classmethod - def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - def __eq__(self, other: Any) -> Self: ... 
- def __ne__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - def __floordiv__(self, other): ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> SeriesIter: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... 
- def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, maintain_order: bool = ...) -> DataFrame: ... - def qcut(self, quantiles: list[float], *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... 
- def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | pli.Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... 
- def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - @property - def arr(self) -> ListNameSpace: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... 
- @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -class SeriesIter: - len: int - i: int - s: Series - def __init__(self, length: int, s: Series) -> None: ... - def __iter__(self) -> SeriesIter: ... - def __next__(self) -> Any: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/series/series.pyi new file mode 100644 index 0000000..8f7c923 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.5/polars/series/series.pyi @@ -0,0 +1,4013 @@ +#: version 0.17.5 +import np as np +import pa as pa +import pd as pd +import pli +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar, Collection, NoReturn, Sequence + +TYPE_CHECKING: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... 
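[Editor's note, not part of the generated stub: a minimal construction sketch using the `name`, `values`, and `dtype` parameters shown in the ``__init__`` signature above; the values are hypothetical.]
>>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int16)  # explicit dtype instead of the inferred Int64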
+ @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + @classmethod + def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def le(self, other: Any) -> Self: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self: + """Method equivalent of operator expression ``series == other``.""" + def ne(self, other: Any) -> Self: + """Method equivalent of operator expression ``series != other``.""" + def ge(self, other: Any) -> Self: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> SeriesIter: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... 
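[Editor's note, not part of the generated stub: the comparison methods listed above (``le``, ``lt``, ``eq``, ...) are documented as method equivalents of the operators; a small hypothetical sketch of that equivalence.]
>>> s = pl.Series("a", [1, 2, 3])
>>> mask_op = s <= 2      # operator form
>>> mask_method = s.le(2) # method form; both return a boolean Series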
+ def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self) -> Any: + ''' + Return the series as a scalar. + + Equivalent to ``s[0]``, with a check that the shape is (1,). + + Examples + -------- + >>> s = pl.Series("a", [1]) + >>> s.item() + 1 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. 
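[Editor's note, not part of the generated stub: ``drop_nulls`` and ``drop_nans`` above have no doctest, so here is a hedged sketch of the null vs. NaN distinction; the values are hypothetical.]
>>> s = pl.Series("a", [1.0, None, float("nan"), 3.0])
>>> no_nulls = s.drop_nulls()  # removes the null (None) entry
>>> no_nans = s.drop_nans()    # removes the floating point NaN entry (NaN is not the same as null)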
+ + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
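[Editor's note, not part of the generated stub: a hedged sketch of the NaN-propagation behaviour that ``nan_max``/``nan_min`` describe above, with hypothetical values.]
>>> s = pl.Series("a", [1.0, float("nan"), 3.0])
>>> regular_max = s.max()       # NaN is ignored by default, per the note above
>>> poisoned_max = s.nan_max()  # NaN propagates ("poisons") the result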
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ...) -> DataFrame: + ''' + Bin values into discrete values. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1]) + shape: (12, 3) + ┌──────┬─────────────┬──────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪══════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ + │ 1.5 ┆ inf ┆ (1.0, inf] │ + │ 2.0 ┆ inf ┆ (1.0, inf] │ + │ 2.5 ┆ inf ┆ (1.0, inf] │ + └──────┴─────────────┴──────────────┘ + + ''' + def qcut(self, quantiles: list[float]) -> DataFrame: + ''' + Bin values into discrete values based on their quantiles. + + Parameters + ---------- + quantiles + Quaniles to create. + We expect quantiles ``0.0 <= quantile <= 1`` + labels + Labels to assign to the quantiles. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75]) + shape: (8, 3) + ┌──────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪═══════════════╡ + │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ + │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ + │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1.0 ┆ inf ┆ (0.25, inf] │ + │ 2.0 ┆ inf ┆ (0.25, inf] │ + └──────┴─────────────┴───────────────┘ + + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. 
+ + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1, 2, 3, 4])
+        >>> s.take([1, 3])
+        shape: (2,)
+        Series: \'a\' [i64]
+        [
+            2
+            4
+        ]
+
+        '''
+    def null_count(self) -> int:
+        """Count the null values in this Series."""
+    def has_validity(self) -> bool:
+        """
+        Return True if the Series has a validity bitmask.
+
+        If there is none, it means that there are no null values.
+        Use this to swiftly assert a Series does not have null values.
+
+        """
+    def is_empty(self) -> bool:
+        '''
+        Check if the Series is empty.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [], dtype=pl.Float32)
+        >>> s.is_empty()
+        True
+
+        '''
+    def is_sorted(self) -> bool:
+        """
+        Check if the Series is sorted.
+
+        Parameters
+        ----------
+        descending
+            Check if the Series is sorted in descending order
+
+        """
+    def is_null(self) -> Series:
+        '''
+        Returns a boolean Series indicating which values are null.
+
+        Returns
+        -------
+        Boolean Series
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1.0, 2.0, 3.0, None])
+        >>> s.is_null()
+        shape: (4,)
+        Series: \'a\' [bool]
+        [
+            false
+            false
+            false
+            true
+        ]
+
+        '''
+    def is_not_null(self) -> Series:
+        '''
+        Returns a boolean Series indicating which values are not null.
+
+        Returns
+        -------
+        Boolean Series
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1.0, 2.0, 3.0, None])
+        >>> s.is_not_null()
+        shape: (4,)
+        Series: \'a\' [bool]
+        [
+            true
+            true
+            true
+            false
+        ]
+
+        '''
+    def is_finite(self) -> Series:
+        '''
+        Returns a boolean Series indicating which values are finite.
+
+        Returns
+        -------
+        Boolean Series
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> s = pl.Series("a", [1.0, 2.0, np.inf])
+        >>> s.is_finite()
+        shape: (3,)
+        Series: \'a\' [bool]
+        [
+            true
+            true
+            false
+        ]
+
+        '''
+    def is_infinite(self) -> Series:
+        '''
+        Returns a boolean Series indicating which values are infinite.
+
+        Returns
+        -------
+        Boolean Series
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> s = pl.Series("a", [1.0, 2.0, np.inf])
+        >>> s.is_infinite()
+        shape: (3,)
+        Series: \'a\' [bool]
+        [
+            false
+            false
+            true
+        ]
+
+        '''
+    def is_nan(self) -> Series:
+        '''
+        Returns a boolean Series indicating which values are NaN.
+
+        Returns
+        -------
+        Boolean Series
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN])
+        >>> s.is_nan()
+        shape: (4,)
+        Series: \'a\' [bool]
+        [
+            false
+            false
+            false
+            true
+        ]
+
+        '''
+    def is_not_nan(self) -> Series:
+        '''
+        Returns a boolean Series indicating which values are not NaN.
+
+        Returns
+        -------
+        Boolean Series
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN])
+        >>> s.is_not_nan()
+        shape: (4,)
+        Series: \'a\' [bool]
+        [
+            true
+            true
+            true
+            false
+        ]
+
+        '''
+    def is_in(self, other: Series | Collection[Any]) -> Series:
+        '''
+        Check if elements of this Series are in the other Series.
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + .. deprecated:: 0.15.16 + `Series.explode` will be removed in favour of `Series.arr.explode` and + `Series.str.explode`. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ListNameSpace.explode : Explode a list column. + StringNameSpace.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. 
+ + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use pyarrow for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. 
+ + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. 
+ + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. 
+ - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | pli.Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + ''' + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_skew(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 0.0 + 0.0 + 0.381802 + 0.0 + ] + + ''' + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). 
+ + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. 
+ + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. 
+ + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def str(self): ... + @property + def struct(self): ... + +class SeriesIter: + def __init__(self, length: int, s: Series) -> None: ... + def __iter__(self) -> SeriesIter: ... + def __next__(self) -> Any: ... 
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/dataframe/frame deleted file mode 100644 index 1dc9668..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/dataframe/frame +++ /dev/null @@ -1,282 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from io import BytesIO, IOBase -from pathlib import Path -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.expr import Expr as Expr -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.lazyframe import LazyFrame as LazyFrame -from polars.polars import PyDataFrame as PyDataFrame -from polars.series import Series as Series -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy 
as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr -from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: set[str] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, names: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: ... - def _div(self, other: Any, floordiv: bool) -> Self: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... 
- def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: Iterator[str] | Sequence[str] | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... 
- def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy[Self]: ... - def groupby_rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> RollingGroupBy[Self]: ... - def groupby_dynamic(self, index_column: str, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> DynamicGroupBy[Self]: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ...) -> Self: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> Self: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: Expr | int | float | None) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... 
- def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> Self: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ...) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... 
- def interpolate(self) -> Self: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs): ... - def merge_sorted(self, other: DataFrame, key: str) -> Self: ... - def set_sorted(self, column: IntoExpr | Iterable[IntoExpr], *more_columns: IntoExpr, descending: bool = ...) -> Self: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/dataframe/frame.pyi new file mode 100644 index 0000000..e1d43ca --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/dataframe/frame.pyi @@ -0,0 +1,5583 @@ +#: version 0.17.9 +import P +import np as np +import pa as pa +import pd as pd +from _io import BytesIO + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr +from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, 
_process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. 
If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. 
+ + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. 
+ + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> Self: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> Self: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> Self: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> Self: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. 
+ + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> numpy_array = df.to_numpy() + >>> type(numpy_array) + + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. 
Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) 
-> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. 
+ table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). 
+ sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. 
+ https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... 
table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. 
If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection uri, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional generator/iterator that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
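+
+        For example, when a long (row-wise) view of the data is enough, an
+        unpivot via ``melt`` is typically far cheaper than a full transpose
+        (illustrative sketch only, not an exact equivalent):
+
+        >>> pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}).melt()  # doctest: +SKIP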
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> Self: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... 
} + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy[Self]: + ''' + Start a groupby operation. 
+ + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: str) -> RollingGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use + *groupby_dynamic* + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: str) -> DynamicGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. 
+ + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + - \'window\': Truncate the start of the window with the \'every\' argument. + - \'datapoint\': Start from the first encountered data point. + - \'monday\': Start the window on the monday before the first data point. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... 
) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + Examples + -------- + Upsample a DataFrame by a certain interval. 
+ + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... 
], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. 
+ - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select([pl.col("foo") * 2, pl.col("bar") * 3]) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. 
+ + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current DataFrame, with zero to \'n\' rows. + + Returns a DataFrame with identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... 
) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> Self: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function : {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + A predefined aggregate function str or an expression. 
+        maintain_order
+            Sort the grouped keys so that the output order is predictable.
+        sort_columns
+            Sort the transposed columns by name. Default is by order of discovery.
+        separator
+            Used as separator/delimiter in generated column names.
+
+        Returns
+        -------
+        DataFrame
+
+        Examples
+        --------
+        >>> df = pl.DataFrame(
+        ...     {
+        ...         "foo": ["one", "one", "one", "two", "two", "two"],
+        ...         "bar": ["A", "B", "C", "A", "B", "C"],
+        ...         "baz": [1, 2, 3, 4, 5, 6],
+        ...     }
+        ... )
+        >>> df.pivot(
+        ...     values="baz", index="foo", columns="bar", aggregate_function="first"
+        ... )
+        shape: (2, 4)
+        ┌─────┬─────┬─────┬─────┐
+        │ foo ┆ A   ┆ B   ┆ C   │
+        │ --- ┆ --- ┆ --- ┆ --- │
+        │ str ┆ i64 ┆ i64 ┆ i64 │
+        ╞═════╪═════╪═════╪═════╡
+        │ one ┆ 1   ┆ 2   ┆ 3   │
+        │ two ┆ 4   ┆ 5   ┆ 6   │
+        └─────┴─────┴─────┴─────┘
+
+        Run an expression as aggregation function
+
+        >>> df = pl.DataFrame(
+        ...     {
+        ...         "col1": ["a", "a", "a", "b", "b", "b"],
+        ...         "col2": ["x", "x", "x", "x", "y", "y"],
+        ...         "col3": [6, 7, 3, 2, 5, 7],
+        ...     }
+        ... )
+        >>> df.pivot(
+        ...     index="col1",
+        ...     columns="col2",
+        ...     values="col3",
+        ...     aggregate_function=pl.element().tanh().mean(),
+        ... )
+        shape: (2, 3)
+        ┌──────┬──────────┬──────────┐
+        │ col1 ┆ x        ┆ y        │
+        │ ---  ┆ ---      ┆ ---      │
+        │ str  ┆ f64      ┆ f64      │
+        ╞══════╪══════════╪══════════╡
+        │ a    ┆ 0.998347 ┆ null     │
+        │ b    ┆ 0.964028 ┆ 0.999954 │
+        └──────┴──────────┴──────────┘
+
+        '''
+    def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self:
+        '''
+        Unpivot a DataFrame from wide to long format.
+
+        Optionally leaves identifiers set.
+
+        This function is useful to massage a DataFrame into a format where one or more
+        columns are identifier variables (id_vars), while all other columns, considered
+        measured variables (value_vars), are "unpivoted" to the row axis, leaving just
+        two non-identifier columns, \'variable\' and \'value\'.
+
+        Parameters
+        ----------
+        id_vars
+            Columns to use as identifier variables.
+        value_vars
+            Values to use as value variables.
+            If `value_vars` is empty, all columns that are not in `id_vars` will be used.
+        variable_name
+            Name to give to the `variable` column. Defaults to "variable"
+        value_name
+            Name to give to the `value` column. Defaults to "value"
+
+        Examples
+        --------
+        >>> df = pl.DataFrame(
+        ...     {
+        ...         "a": ["x", "y", "z"],
+        ...         "b": [1, 3, 5],
+        ...         "c": [2, 4, 6],
+        ...     }
+        ... )
+        >>> df.melt(id_vars="a", value_vars=["b", "c"])
+        shape: (6, 3)
+        ┌─────┬──────────┬───────┐
+        │ a   ┆ variable ┆ value │
+        │ --- ┆ ---      ┆ ---   │
+        │ str ┆ str      ┆ i64   │
+        ╞═════╪══════════╪═══════╡
+        │ x   ┆ b        ┆ 1     │
+        │ y   ┆ b        ┆ 3     │
+        │ z   ┆ b        ┆ 5     │
+        │ x   ┆ c        ┆ 2     │
+        │ y   ┆ c        ┆ 4     │
+        │ z   ┆ c        ┆ 6     │
+        └─────┴──────────┴───────┘
+
+        '''
+    def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> Self:
+        '''
+        Unstack a long table to a wide form without doing an aggregation.
+
+        This can be much faster than a pivot, because it can skip the grouping phase.
+
+        Warnings
+        --------
+        This functionality is experimental and may be subject to changes
+        without it being considered a breaking change.
+
+        Parameters
+        ----------
+        step
+            Number of rows in the unstacked frame.
+        how : { \'vertical\', \'horizontal\' }
+            Direction of the unstack.
+        columns
+            Name of the column(s) to include in the operation.
+            If set to ``None`` (default), use all columns.
+ fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... ) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... 
) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. 
+ + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialises all frame data as a list of rows. + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.rows() + [(1, 2), (3, 4), (5, 6)] + >>> df.rows(named=True) + [{\'a\': 1, \'b\': 2}, {\'a\': 3, \'b\': 4}, {\'a\': 5, \'b\': 6}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters in your use-case + you should export to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + An iterator of tuples (default) or dictionaries (if named) of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. 
+ + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... 
"b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> Self: + ''' + Return Pearson product-moment correlation coefficients. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + keyword arguments are passed to numpy corrcoef + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: IntoExpr | Iterable[IntoExpr], *more_columns: IntoExpr) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. 
+ + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/expr/expr deleted file mode 100644 index a32dc8d..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/expr/expr +++ /dev/null @@ -1,250 +0,0 @@ -from typing_extensions import ParamSpec, Generic - -from datetime import timedelta -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.lazyframe import LazyFrame as LazyFrame -from polars.polars import PyExpr as PyExpr -from polars.series import Series as Series -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils.convert import 
_timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: set[str] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: Expr, *, upcast: bool = ...) 
-> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any], *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... 
- def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: Expr | str) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... 
- def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def arr(self) -> ExprListNameSpace: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
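The stub removed above declares ``P = ParamSpec('P')``, types ``Expr`` as ``Generic[P]``, and gives ``pipe`` the signature ``Callable[Concatenate[Expr, P], T]`` with ``*args: P.args, **kwargs: P.kwargs``; the regenerated ``.pyi`` that follows keeps the same ``pipe`` signature and adds the upstream docstrings. As a minimal sketch (not part of the stubs themselves) of what that ``ParamSpec`` annotation buys a caller, assuming a hypothetical ``scale`` helper and column name:

import polars as pl

def scale(expr: pl.Expr, factor: int) -> pl.Expr:
    # `pipe` passes the expression as the first argument and forwards the rest.
    return expr * factor

df = pl.DataFrame({"a": [1, 2, 3]})
# With the Concatenate/ParamSpec-based `pipe` signature, a type checker can
# verify that `factor` receives an int here; e.g. `factor="2"` would be flagged.
print(df.select(pl.col("a").pipe(scale, factor=2)))
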
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/expr/expr.pyi new file mode 100644 index 0000000..8fcb12c --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/expr/expr.pyi @@ -0,0 +1,6122 @@ +#: version 0.17.9 +import P +import np as np +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias, redirect as redirect +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... 
+ def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + pl.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... [ + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ] + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. 
+ + >>> df.select([(pl.lit(10) / pl.all()).keep_name()]) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr): + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + ... + >>> df = pl.DataFrame({"a": ["a: 1", "b: 2", "c: 3"]}) + >>> df.with_columns(pl.col("a").pipe(extract_number)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + >>> df.select( + ... pl.all().reverse().map_alias(lambda colName: colName + "_reverse") + ... ) + shape: (2, 2) + ┌───────────┬───────────┐ + │ A_reverse ┆ B_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════════╪═══════════╡ + │ 2 ┆ 4 │ + │ 1 ┆ 3 │ + └───────────┴───────────┘ + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. 
+ + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... 
"b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: Expr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └─────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... 
) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... 
) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... 
) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: int | float | bool | str | Expr | list[Any]) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.fill_null(strategy="zero") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ 0 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(99) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ 99 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(strategy="forward") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.fill_nan("zero") + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪══════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ zero │ + │ zero ┆ 6.0 │ + └──────┴──────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 6 │ + │ null ┆ 6 │ + └──────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"a": [-1, 0, 1]})
+        >>> df.select(pl.col("a").std())
+        shape: (1, 1)
+        ┌─────┐
+        │ a   │
+        │ --- │
+        │ f64 │
+        ╞═════╡
+        │ 1.0 │
+        └─────┘
+
+        '''
+    def var(self, ddof: int = ...) -> Self:
+        '''
+        Get variance.
+
+        Parameters
+        ----------
+        ddof
+            “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
+            where N represents the number of elements.
+            By default ddof is 1.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"a": [-1, 0, 1]})
+        >>> df.select(pl.col("a").var())
+        shape: (1, 1)
+        ┌─────┐
+        │ a   │
+        │ --- │
+        │ f64 │
+        ╞═════╡
+        │ 1.0 │
+        └─────┘
+
+        '''
+    def max(self) -> Self:
+        '''
+        Get maximum value.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]})
+        >>> df.select(pl.col("a").max())
+        shape: (1, 1)
+        ┌─────┐
+        │ a   │
+        │ --- │
+        │ f64 │
+        ╞═════╡
+        │ 1.0 │
+        └─────┘
+
+        '''
+    def min(self) -> Self:
+        '''
+        Get minimum value.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]})
+        >>> df.select(pl.col("a").min())
+        shape: (1, 1)
+        ┌──────┐
+        │ a    │
+        │ ---  │
+        │ f64  │
+        ╞══════╡
+        │ -1.0 │
+        └──────┘
+
+        '''
+    def nan_max(self) -> Self:
+        '''
+        Get maximum value, but propagate/poison encountered NaN values.
+
+        This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values,
+        whereas polars defaults to ignoring them.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"a": [0, float("nan")]})
+        >>> df.select(pl.col("a").nan_max())
+        shape: (1, 1)
+        ┌─────┐
+        │ a   │
+        │ --- │
+        │ f64 │
+        ╞═════╡
+        │ NaN │
+        └─────┘
+
+        '''
+    def nan_min(self) -> Self:
+        '''
+        Get minimum value, but propagate/poison encountered NaN values.
+
+        This differs from numpy\'s `nanmin` as numpy defaults to propagating NaN values,
+        whereas polars defaults to ignoring them.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"a": [0, float("nan")]})
+        >>> df.select(pl.col("a").nan_min())
+        shape: (1, 1)
+        ┌─────┐
+        │ a   │
+        │ --- │
+        │ f64 │
+        ╞═════╡
+        │ NaN │
+        └─────┘
+
+        '''
+    def sum(self) -> Self:
+        '''
+        Get sum value.
+
+        Notes
+        -----
+        Dtypes in {Int8, UInt8, Int16, UInt16} are cast to
+        Int64 before summing to prevent overflow issues.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"a": [-1, 0, 1]})
+        >>> df.select(pl.col("a").sum())
+        shape: (1, 1)
+        ┌─────┐
+        │ a   │
+        │ --- │
+        │ i64 │
+        ╞═════╡
+        │ 0   │
+        └─────┘
+
+        '''
+    def mean(self) -> Self:
+        '''
+        Get mean value.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"a": [-1, 0, 1]})
+        >>> df.select(pl.col("a").mean())
+        shape: (1, 1)
+        ┌─────┐
+        │ a   │
+        │ --- │
+        │ f64 │
+        ╞═════╡
+        │ 0.0 │
+        └─────┘
+
+        '''
+    def median(self) -> Self:
+        '''
+        Get median value using linear interpolation.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"a": [-1, 0, 1]})
+        >>> df.select(pl.col("a").median())
+        shape: (1, 1)
+        ┌─────┐
+        │ a   │
+        │ --- │
+        │ f64 │
+        ╞═════╡
+        │ 0.0 │
+        └─────┘
+
+        '''
+    def product(self) -> Self:
+        '''
+        Compute the product of an expression.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"a": [1, 2, 3]})
+        >>> df.select(pl.col("a").product())
+        shape: (1, 1)
+        ┌─────┐
+        │ a   │
+        │ --- │
+        │ i64 │
+        ╞═════╡
+        │ 6   │
+        └─────┘
+
+        '''
+    def n_unique(self) -> Self:
+        '''
+        Count unique values.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"a": [1, 1, 2]})
+        >>> df.select(pl.col("a").n_unique())
+        shape: (1, 1)
+        ┌─────┐
+        │ a   │
+        │ --- │
+        │ u32 │
+        ╞═════╡
+        │ 2   │
+        └─────┘
+
+        '''
+    def approx_unique(self) -> Self:
+        '''
+        Approx count unique values.
+ + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. 
+ + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. 
+ + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. 
+ + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + ExprStringNameSpace.explode : Explode a string column. + + """ + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... 
) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... 
pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of logical exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... 
schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: Expr | str) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... 
) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'linear\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... 
"b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_min(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + └──────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_max(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 8.0, 6.0, 2.0, 16.0, 10.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_mean(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.5 │ + │ 7.0 │ + │ 4.0 │ + │ 9.0 │ + │ 13.0 │ + └──────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_sum(window_size=2), + ... ] + ... 
) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 3.0 │ + │ 5.0 │ + │ 7.0 │ + │ 9.0 │ + │ 11.0 │ + └──────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_std(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 1.527525 │ + │ 2.0 │ + └──────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 
2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_var(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 2.333333 │ + │ 4.0 │ + └──────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_median(window_size=3), + ... ] + ... 
) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_quantile(quantile=0.33, window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + └──────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... 
pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + """ + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + """ + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. 
+ + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). 
If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. 
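+ (An aside on the clip methods above: they only take numeric bounds. The
+ "when, then, otherwise" fallback they suggest for other dtypes might look
+ roughly like this for a Date column; the column name and the bound are
+ made up:)
+
+ >>> import polars as pl
+ >>> from datetime import date
+ >>> df = pl.DataFrame({"d": [date(2020, 1, 1), date(2023, 6, 1)]})
+ >>> df.with_columns(
+ ...     pl.when(pl.col("d") > date(2022, 12, 31))
+ ...     .then(pl.lit(date(2022, 12, 31)))
+ ...     .otherwise(pl.col("d"))
+ ...     .alias("d_clipped")
+ ... )  # doctest: +SKIP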
+ + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. 
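+ (An aside on the ``ewm_mean``/``ewm_std``/``ewm_var`` methods above: the
+ decay can be given as ``com``, ``span``, ``half_life`` or ``alpha``. The
+ sketch below only restates the conversion formulas quoted in their
+ docstrings as plain Python; the function name is made up and it is not the
+ internal polars helper:)
+
+ >>> import math
+ >>> def decay_to_alpha(com=None, span=None, half_life=None, alpha=None):
+ ...     if com is not None:  # alpha = 1 / (1 + com), for com >= 0
+ ...         return 1.0 / (1.0 + com)
+ ...     if span is not None:  # alpha = 2 / (span + 1), for span >= 1
+ ...         return 2.0 / (span + 1.0)
+ ...     if half_life is not None:  # alpha = 1 - exp(-ln(2) / half_life)
+ ...         return 1.0 - math.exp(-math.log(2.0) / half_life)
+ ...     return alpha
+ >>> decay_to_alpha(com=1)  # com=1, as in the doctests above
+ 0.5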
+ + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. 
valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/lazyframe/frame deleted file mode 100644 index bd19992..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/lazyframe/frame +++ /dev/null @@ -1,131 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.expr.expr import Expr as Expr -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.series import Series as Series -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, PolarsExprType as PolarsExprType, PythonLiteral as PythonLiteral, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath -from typing import Any, Callable, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: set[str] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... 
- def __setstate__(self, state) -> None: ... - @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_plan(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def describe_optimized_plan(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... 
- def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy[Self]: ... - def groupby_rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ...) -> LazyGroupBy[Self]: ... 
- def groupby_dynamic(self, index_column: str, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ...) -> LazyGroupBy[Self]: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, exprs: str | PolarsExprType | PythonLiteral | Series | Iterable[str | PolarsExprType | PythonLiteral | Series | None] | None = ..., *more_exprs: str | PolarsExprType | PythonLiteral | Series | None, **named_exprs: str | PolarsExprType | PythonLiteral | Series | None) -> Self: ... - def with_context(self, other): ... - def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) 
-> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: IntoExpr | Iterable[IntoExpr], *more_columns: IntoExpr, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..17cfeca --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/lazyframe/frame.pyi @@ -0,0 +1,3298 @@ +#: version 0.17.9 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import expr_to_lit_or_expr as expr_to_lit_or_expr, selection_to_pyexpr_list as selection_to_pyexpr_list +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> Self: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. 
+ projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def describe_plan(self) -> str: + ''' + Create a string representation of the unoptimized query plan. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + .. deprecated:: 0.16.10 + Use ``LazyFrame.explain`` + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).describe_plan() # doctest: +SKIP + + ''' + def describe_optimized_plan(self) -> str: + """Create a string representation of the optimized query plan.""" + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... 
) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. 
Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. 
+ streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... 
) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... 
+ shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy[Self]: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: str) -> LazyGroupBy[Self]: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. 
+ Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.groupby_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: str) -> LazyGroupBy[Self]: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. 
Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + * \'datapoint\': Start from the first encountered data point. + * \'monday\': Start the window on the monday before the first data point. + + See Also + -------- + groupby_rolling + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... 
).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, exprs: str | PolarsExprType | PythonLiteral | Series | Iterable[str | PolarsExprType | PythonLiteral | Series | None] | None = ..., *more_exprs: str | PolarsExprType | PythonLiteral | Series | None, **named_exprs: str | PolarsExprType | PythonLiteral | Series | None) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... 
(pl.col("c").is_not()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... 
).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the DataFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. 
+ + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. 
+ + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... 
) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: IntoExpr | Iterable[IntoExpr], *more_columns: IntoExpr) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/series/series deleted file mode 100644 index 0f163d5..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/series/series +++ /dev/null @@ -1,319 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import internals as pli -from polars.dataframe import DataFrame as DataFrame -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.expr.expr import Expr as Expr -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as 
RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: set[str] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - @classmethod - def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... 
- @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - def __floordiv__(self, other): ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> SeriesIter: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... 
- def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, maintain_order: bool = ...) -> DataFrame: ... - def qcut(self, quantiles: list[float], *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... 
- def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | pli.Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... 
- def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - @property - def arr(self) -> ListNameSpace: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -class SeriesIter: - len: int - i: int - s: Series - def __init__(self, length: int, s: Series) -> None: ... - def __iter__(self) -> SeriesIter: ... - def __next__(self) -> Any: ... 
- -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/series/series.pyi new file mode 100644 index 0000000..81628f9 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.17.9/polars/series/series.pyi @@ -0,0 +1,4038 @@ +#: version 0.17.9 +import np as np +import pa as pa +import pd as pd +import pli +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar, Collection, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... 
+ @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + @classmethod + def _repeat(cls, name: str, val: int | float | str | bool, n: int, dtype: PolarsDataType) -> Self: ... + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def le(self, other: Any) -> Self: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self: + """Method equivalent of operator expression ``series == other``.""" + def ne(self, other: Any) -> Self: + """Method equivalent of operator expression ``series != other``.""" + def ge(self, other: Any) -> Self: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> SeriesIter: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... 
+ def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) 
-> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. 
+ By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ...) -> DataFrame: + ''' + Bin values into discrete values. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1]) + shape: (12, 3) + ┌──────┬─────────────┬──────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪══════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ + │ 1.5 ┆ inf ┆ (1.0, inf] │ + │ 2.0 ┆ inf ┆ (1.0, inf] │ + │ 2.5 ┆ inf ┆ (1.0, inf] │ + └──────┴─────────────┴──────────────┘ + + ''' + def qcut(self, quantiles: list[float]) -> DataFrame: + ''' + Bin values into discrete values based on their quantiles. + + Parameters + ---------- + quantiles + Quaniles to create. + We expect quantiles ``0.0 <= quantile <= 1`` + labels + Labels to assign to the quantiles. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75]) + shape: (8, 3) + ┌──────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪═══════════════╡ + │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ + │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ + │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1.0 ┆ inf ┆ (0.25, inf] │ + │ 2.0 ┆ inf ┆ (0.25, inf] │ + └──────┴─────────────┴───────────────┘ + + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. 
+ + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. 
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list or utf8 Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ListNameSpace.explode : Explode a list column. + StringNameSpace.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. 
+ + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use pyarrow for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). 
+ + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | pli.Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + ''' + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_skew(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 0.0 + 0.0 + 0.381802 + 0.0 + ] + + ''' + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) 
-> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. 
+ + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).rename("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def arr(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def str(self): ... + @property + def struct(self): ... + +class SeriesIter: + def __init__(self, length: int, s: Series) -> None: ... + def __iter__(self) -> SeriesIter: ... + def __next__(self) -> Any: ... 
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/dataframe/frame deleted file mode 100644 index 929aef6..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/dataframe/frame +++ /dev/null @@ -1,282 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Null as Null, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, Struct as Struct, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, 
UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: set[str] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, names: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... 
- def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... 
- def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ...) -> DataFrame: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... 
- @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ...) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... 
- @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs): ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/dataframe/frame.pyi new file mode 100644 index 0000000..42758b7 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/dataframe/frame.pyi @@ -0,0 +1,5728 @@ +#: version 0.18.0 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Null as Null, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, Struct as Struct, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, 
series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. 
The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. 
+ + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. 
+ allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... 
) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. 
+ compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". 
+ column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. 
For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... 
column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... 
} + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection uri, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Note: Some polars data types like `Null`, `Categorical` and `Time` are + not supported by the delta protocol specification. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Examples + -------- + Instantiate a basic dataframe: + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... 
} + ... ) + + Write DataFrame as a Delta Lake table on local filesystem. + + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on local filesystem. + Note: This will fail if schema of the new data does not match the + schema of existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + Note: If the schema of the new and old data is same, + then setting `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table on cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... 
"ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a groupby operation. 
+ + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use + *groupby_dynamic* + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. 
Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + - \'window\': Truncate the start of the window with the \'every\' argument. + - \'datapoint\': Start from the first encountered data point. + - \'monday\': Start the window on the monday before the first data point. + - \'tuesday\': Start the window on the tuesday before the first data point. + - ... + - \'sunday\': Start the window on the sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... 
interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... 
) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. 
+ by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). 
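The ``@lru_cache`` tip above is easier to see in code. Below is a minimal illustrative sketch, not part of the generated stub: ``expensive_score`` and its cache size are hypothetical stand-ins for a costly per-row computation.

from functools import lru_cache

import polars as pl

@lru_cache(maxsize=None)
def expensive_score(a: int, b: int) -> int:
    # Stand-in for an expensive computation; repeated (a, b) pairs hit the cache.
    return a * 2 + b

df = pl.DataFrame({"foo": [1, 2, 1], "bar": [-1, 5, -1]})
# Each row arrives as a tuple; duplicate rows reuse the cached result instead of recomputing.
out = df.apply(lambda row: expensive_score(row[0], row[1]))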
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. 
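The extend-versus-vstack guidance above can be illustrated with a short sketch; the literal frames below are hypothetical stand-ins for, say, frames read from several files.

import polars as pl

frames = [
    pl.DataFrame({"foo": [1, 2], "bar": [6, 7]}),
    pl.DataFrame({"foo": [3, 4], "bar": [8, 9]}),
    pl.DataFrame({"foo": [5, 6], "bar": [10, 11]}),
]

# Many appends before querying: vstack cheaply chains the chunks together...
combined = frames[0]
for frame in frames[1:]:
    combined = combined.vstack(frame)
# ...then rechunk once so subsequent queries run over contiguous memory.
combined = combined.rechunk()

# Single append followed immediately by a query: extend writes into the
# existing allocation, avoiding the extra chunk.
live = pl.DataFrame({"foo": [1, 2], "bar": [6, 7]})
live = live.extend(pl.DataFrame({"foo": [3], "bar": [8]}))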
+ + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty copy of the current DataFrame, with zero to \'n\' rows. + + Returns a DataFrame with identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... 
) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function : {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + A predefined aggregate function str or an expression. + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot( + ... values="baz", index="foo", columns="bar", aggregate_function="first" + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... 
) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. 
+ + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. 
+ + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + exprs + Column(s) to select. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. + **named_exprs + Additional columns to add, specified as keyword arguments. 
The columns + will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... 
+ shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... 
"bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. 
+ + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) 
-> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialises all frame data as a list of rows. + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.rows() + [(1, 2), (3, 4), (5, 6)] + >>> df.rows(named=True) + [{\'a\': 1, \'b\': 2}, {\'a\': 3, \'b\': 4}, {\'a\': 5, \'b\': 6}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. 
+ buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters in your use-case + you should export to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + An iterator of tuples (default) or dictionaries (if named) of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. 
Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return Pearson product-moment correlation coefficients. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. 
+ + Parameters + ---------- + **kwargs + keyword arguments are passed to numpy corrcoef + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
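A minimal usage sketch for `merge_sorted` and `set_sorted` (whose docstrings above ship without an Examples section), assuming the polars API exactly as documented there; this is illustrative only and not part of the generated stub:

import polars as pl

# Both frames are already sorted by "key" and share the same schema,
# as merge_sorted requires; the merged result is itself sorted by "key".
left = pl.DataFrame({"key": [1, 3, 5], "val": ["a", "c", "e"]})
right = pl.DataFrame({"key": [2, 4, 6], "val": ["b", "d", "f"]})
merged = left.merge_sorted(right, key="key")

# set_sorted only flags the column as sorted so later operations can rely
# on that invariant; it does not sort the data itself.
flagged = merged.set_sorted("key")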
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/expr/expr deleted file mode 100644 index 1795740..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/expr/expr +++ /dev/null @@ -1,255 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: set[str] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... 
- def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: Expr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... 
- def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... 
- def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... 
- def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/expr/expr.pyi new file mode 100644 index 0000000..4431d52 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/expr/expr.pyi @@ -0,0 +1,6241 @@ +#: version 0.18.0 +import P +import np as np +import pl +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... 
+ def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + pl.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. 
+ + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to root column name. + + See Also + -------- + alias + map_alias + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to root column name. + + See Also + -------- + alias + map_alias + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... 
"B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + + >>> df.select(pl.all().reverse().suffix("_reverse")).with_columns( + ... pl.all().map_alias( + ... # Remove "_reverse" suffix and convert to lower case. + ... lambda col_name: col_name.rsplit("_reverse", 1)[0].lower() + ... ) + ... ) + shape: (2, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 2 ┆ 4 ┆ 2 ┆ 4 │ + │ 1 ┆ 3 ┆ 1 ┆ 3 │ + └───────────┴───────────┴─────┴─────┘ + + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... 
) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: Expr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... 
) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. 
+ + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. 
+ For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.fill_null(strategy="zero") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ 0 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(99) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ 99 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(strategy="forward") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.fill_nan("zero") + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪══════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ zero │ + │ zero ┆ 6.0 │ + └──────┴──────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 6 │ + │ null ┆ 6 │ + └──────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. 
+ + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to there + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + See Also + -------- + map_dict + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + """ + Explode a list Series. 
+ + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + ExprStringNameSpace.explode : Explode a string column. + + """ + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... 
) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + """ + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... 
"y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + """ + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of logical exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'linear\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_min(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + └──────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_max(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. 
+ + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 8.0, 6.0, 2.0, 16.0, 10.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_mean(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.5 │ + │ 7.0 │ + │ 4.0 │ + │ 9.0 │ + │ 13.0 │ + └──────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_sum(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 3.0 │ + │ 5.0 │ + │ 7.0 │ + │ 9.0 │ + │ 11.0 │ + └──────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_std(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 1.527525 │ + │ 2.0 │ + └──────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Self: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_var(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 2.333333 │ + │ 4.0 │ + └──────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_median(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_quantile(quantile=0.33, window_size=3), + ... ] + ... 
) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + └──────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + """ + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + """ + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. 
+ + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f32 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. 
math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. 
+ + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. 
+ min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. 
+ + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... 
).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + This can actually hurt performance and can have a lot of contention. + It is advised not to use it until actually benchmarked on your problem. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... 
.map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
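The expr stub above closes with the module-level helpers `_prepare_alpha` and `_prepare_rolling_window_args`; the former's docstring says the EWM decay given as `com`, `span`, `half_life`, or `alpha` is normalised to a single smoothing factor. A minimal sketch of the conventional conversions that docstring implies (an illustration only, not the polars implementation) could look like this:

```python
# Illustrative sketch of the usual EWM decay conversions implied by the
# `_prepare_alpha` docstring; the actual polars code may differ in details.
from __future__ import annotations

import math


def ewm_alpha(
    com: float | None = None,
    span: float | None = None,
    half_life: float | None = None,
    alpha: float | None = None,
) -> float:
    """Normalise a single EWM decay specification to a smoothing factor alpha."""
    given = [v is not None for v in (com, span, half_life, alpha)]
    if sum(given) != 1:
        raise ValueError("exactly one of com, span, half_life or alpha must be set")
    if com is not None:        # alpha = 1 / (1 + com)
        return 1.0 / (1.0 + com)
    if span is not None:       # alpha = 2 / (span + 1)
        return 2.0 / (span + 1.0)
    if half_life is not None:  # alpha = 1 - exp(-ln(2) / half_life)
        return 1.0 - math.exp(-math.log(2.0) / half_life)
    return alpha


print(ewm_alpha(span=9))  # 0.2
```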
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/lazyframe/frame deleted file mode 100644 index 598151e..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/lazyframe/frame +++ /dev/null @@ -1,128 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr, Series as Series -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath -from typing import Any, Callable, Collection, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: set[str] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... 
- @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) 
-> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: ... - def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... 
- def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other): ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... 
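The rest of the diff replaces the old, docstring-free `frame` stub above with a regenerated `frame.pyi` whose first line is a `#: version 0.18.0` marker. As a rough sketch under assumed names and paths (this helper is not part of the repo), that marker makes it easy to select the stub matching an installed polars version:

```python
# Hypothetical helper for illustration only: read the "#: version X.Y.Z" header
# written into each generated stub and pick the newest stub that does not exceed
# the installed polars version. Directory layout is assumed from the diff paths.
from pathlib import Path

from packaging import version


def stub_version(stub: Path) -> version.Version:
    """Parse the version marker from the first line of a generated stub."""
    first_line = stub.read_text().splitlines()[0]
    assert first_line.startswith("#: version ")
    return version.parse(first_line.removeprefix("#: version "))


def pick_stub(stubs_root: Path, installed: str) -> Path:
    """Return the LazyFrame stub for the newest version <= the installed polars."""
    target = version.parse(installed)
    candidates = {
        stub_version(p): p
        for p in stubs_root.glob("*/polars/lazyframe/frame.pyi")
    }
    best = max(v for v in candidates if v <= target)
    return candidates[best]
```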
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..74ff3e2 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/lazyframe/frame.pyi @@ -0,0 +1,3311 @@ +#: version 0.18.0 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. 
+ + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. 
+ streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. 
+ simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. 
+ row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... 
) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + exprs + Column(s) to select. 
Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to select, specified as positional arguments. + **named_exprs + Additional columns to select, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. 
+ + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time column. + + Also works for index values of type Int32 or Int64. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.groupby_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... 
pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + * \'datapoint\': Start from the first encountered data point. 
+ * \'monday\': Start the window on the monday before the first data point. + * \'tuesday\': Start the window on the tuesday before the first data point. + * ... + * \'sunday\': Start the window on the sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_rolling + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... 
pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... 
).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... 
).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, exprs: IntoExpr | Iterable[IntoExpr] | None = ..., *more_exprs: IntoExpr, **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + exprs + Column or columns to add. Accepts expression input. Strings are parsed + as column names, other non-expression inputs are parsed as literals. + *more_exprs + Additional columns to add, specified as positional arguments. 
+ **named_exprs + Additional columns to add, specified as keyword arguments. The columns + will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... 
+ shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... 
} + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. 
The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... 
) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. 
+ Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the caller's responsibility that the frames are sorted + by that key; otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged. + key + Key that is sorted. 
+ + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/series/series deleted file mode 100644 index e3f2272..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/series/series +++ /dev/null @@ -1,336 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from 
polars.utils.various import _is_generator as _is_generator, find_stacklevel as find_stacklevel, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: set[str] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... 
- @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - def __floordiv__(self, other): ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, maintain_order: bool = ...) -> DataFrame: ... 
- def qcut(self, quantiles: list[float], *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool | None = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... 
- def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... 
- def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/series/series.pyi new file mode 100644 index 0000000..4717eb0 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.0/polars/series/series.pyi @@ -0,0 +1,4061 @@ +#: version 0.18.0 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, find_stacklevel as find_stacklevel, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... 
+ @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... 
+ def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. 
+ """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ...) -> DataFrame: + ''' + Bin values into discrete values. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1]) + shape: (12, 3) + ┌──────┬─────────────┬──────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪══════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ + │ 1.5 ┆ inf ┆ (1.0, inf] │ + │ 2.0 ┆ inf ┆ (1.0, inf] │ + │ 2.5 ┆ inf ┆ (1.0, inf] │ + └──────┴─────────────┴──────────────┘ + + ''' + def qcut(self, quantiles: list[float]) -> DataFrame: + ''' + Bin values into discrete values based on their quantiles. + + Parameters + ---------- + quantiles + Quaniles to create. + We expect quantiles ``0.0 <= quantile <= 1`` + labels + Labels to assign to the quantiles. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. 
+ maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75]) + shape: (8, 3) + ┌──────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪═══════════════╡ + │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ + │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ + │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1.0 ┆ inf ┆ (0.25, inf] │ + │ 2.0 ┆ inf ┆ (0.25, inf] │ + └──────┴─────────────┴───────────────┘ + + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. 
+ + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. 
If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. 
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ListNameSpace.explode : Explode a list column. + StringNameSpace.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. 
+ + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). 
+ + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + ''' + Compute a rolling skew. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_skew(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 0.0 + 0.0 + 0.381802 + 0.0 + ] + + ''' + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) 
-> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. 
+ + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
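# --- Illustrative sketch (editor-added; not part of the generated stub or of the patch above) ---
# A minimal, hedged usage example exercising a few of the Series methods whose
# signatures and doctests appear in the stub content above. It assumes a polars
# release in the range these stubs target (roughly 0.18.x) is installed; the
# expected values simply mirror the doctest examples shown in the docstrings.
import polars as pl

s = pl.Series("a", [1, 2, 3, None])

# fill_null(strategy="forward") fills the trailing null with the last seen value:
# [1, 2, 3, 3], as in the fill_null docstring example.
filled = s.fill_null(strategy="forward")

# rolling_mean(window_size=2) yields null until the window is full:
# [null, 1.5, 2.5, 3.0] for the filled series.
means = filled.rolling_mean(window_size=2)

# map_dict remaps values through a dictionary, with a default for misses
# (mirrors the ISO-3166 example in the map_dict docstring).
codes = pl.Series("iso3166", ["TUR", "???", "JPN"])
names = codes.map_dict({"TUR": "Türkiye", "JPN": "Japan"}, default="Unspecified")

print(filled, means, names, sep="\n")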
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/dataframe/frame deleted file mode 100644 index 060ccc4..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/dataframe/frame +++ /dev/null @@ -1,282 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase, TextIOWrapper -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Null as Null, Object as Object, Struct as Struct, Time as Time, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SelectorType as SelectorType, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as 
UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ..., ignore_errors: bool = ...) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def flags(self) -> dict[str, dict[str, bool]]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... 
- def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ..., freeze_panes: str | tuple[int, int] | tuple[str, int, int] | tuple[int, int, int, int] | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: str | Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... 
- def slice(self, offset: int, length: int | None = ...) -> Self: ...
- def head(self, n: int = ...) -> Self: ...
- def tail(self, n: int = ...) -> Self: ...
- def limit(self, n: int = ...) -> Self: ...
- def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: ...
- def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ...
- def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ...
- def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ...
- def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ...
- def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ...
- def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ...
- def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ...
- def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ...
- def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ...
- def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ...
- def vstack(self, other: DataFrame, *, in_place: bool = ...) -> Self: ...
- def extend(self, other: DataFrame) -> Self: ...
- def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: ...
- def drop_in_place(self, name: str) -> Series: ...
- def clear(self, n: int = ...) -> Self: ...
- def clone(self) -> Self: ...
- def get_columns(self) -> list[Series]: ...
- def get_column(self, name: str) -> Series: ...
- def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ...
- def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ...
- def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: ...
- def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ...
- def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ...
- def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ...
- @overload
- def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ...
- @overload
- def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ...
- def shift(self, periods: int) -> Self: ...
- def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ...
- def is_duplicated(self) -> Series: ...
- def is_unique(self) -> Series: ...
- def lazy(self) -> LazyFrame: ...
- def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ...
- def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ...
- @overload
- def n_chunks(self, strategy: Literal['first'] = ...) -> int: ...
- @overload
- def n_chunks(self, strategy: Literal['all']) -> list[int]: ...
- @overload
- def max(self, axis: Literal[0] = ...) -> Self: ...
- @overload
- def max(self, axis: Literal[1]) -> Series: ...
- @overload
- def max(self, axis: int = ...) -> Self | Series: ...
- @overload
- def min(self, axis: Literal[0] = ...) -> Self: ...
- @overload
- def min(self, axis: Literal[1]) -> Series: ...
- @overload
- def min(self, axis: int = ...) -> Self | Series: ...
- @overload
- def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ...
- @overload
- def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ...
- @overload
- def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ...
- @overload
- def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ...
- @overload
- def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ...
- @overload
- def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ...
- def std(self, ddof: int = ...) -> Self: ...
- def var(self, ddof: int = ...) -> Self: ...
- def median(self) -> Self: ...
- def product(self) -> DataFrame: ...
- def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ...
- def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ...
- def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ...
- def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ...
- def rechunk(self) -> Self: ...
- def null_count(self) -> Self: ...
- def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ...
- def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ...
- @overload
- def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ...
- @overload
- def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ...
- @overload
- def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ...
- @overload
- def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ...
- def rows_by_key(self, key: str | Sequence[str] | SelectorType, *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ...
- @overload
- def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ...
- @overload
- def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ...
- def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ...
- def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ...
- def take_every(self, n: int) -> DataFrame: ...
- def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ...
- def interpolate(self) -> DataFrame: ...
- def is_empty(self) -> bool: ...
- def to_struct(self, name: str) -> Series: ...
- def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ...
- def corr(self, **kwargs: Any) -> DataFrame: ...
- def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ...
- def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ...
- def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ...
-
-def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ...
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/dataframe/frame.pyi
new file mode 100644
index 0000000..25dcf28
--- /dev/null
+++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/dataframe/frame.pyi
@@ -0,0 +1,6045 @@
+#: version 0.18.11
+import P
+import deltalake
+import np as np
+import pa as pa
+import pd as pd
+from _io import BytesIO, TextIOWrapper
+
+from builtins import PyDataFrame
+from pathlib import Path
+from polars.dataframe._html import NotebookFormatter as NotebookFormatter
+from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy
+from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Null as Null, Object as Object, Struct as Struct, Time as Time, Utf8 as Utf8
+from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype
+from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow
+from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError
+from polars.functions.lazy import col as col, lit as lit
+from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file
+from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name
+from polars.slice
import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. 
+ schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
+ schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. 
+ + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. 
+ + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + ``structured`` is set to ``False`` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... 
) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. 
+ datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. 
+ + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. 
+ + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. 
+ https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... 
table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. 
+ pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + If you pass ``partition_cols`` here, the dataset will be written + using ``pyarrow.parquet.write_to_dataset``. + The ``partition_cols`` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, *args, **kwargs) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Note: Some polars data types like `Null`, `Categorical` and `Time` are + not supported by the delta protocol specification. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Examples + -------- + Instantiate a basic dataframe: + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + + Write DataFrame as a Delta Lake table on local filesystem. + + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on local filesystem. 
+ Note: This will fail if schema of the new data does not match the + schema of existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + Note: If the schema of the new and old data is same, + then setting `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table on cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... 
"e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. 
+ null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. 
+ + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. 
+ + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... 
+ a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... 
"2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... 
) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. 
+ every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). 
Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, *args, **kwargs) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from ``vstack`` which adds the chunks from ``other`` to the chunks of + this ``DataFrame``, ``extend`` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``vstack`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer ``vstack`` over ``extend`` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single ``DataFrame``. In the latter case, finish the sequence of + ``vstack`` operations with a ``rechunk``. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot( + ... values="baz", index="foo", columns="bar", aggregate_function="first" + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.int_range(0, 9, eager=True), + ... } + ... 
) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... 
) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. 
+ + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using ``iter_rows`` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + ''' + def rows_by_key(self, key: str | Sequence[str] | SelectorType) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. 
+ + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. 
+ + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... 
) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy ``corrcoef``. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. 
+ + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
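The `set_sorted` method and the `flags` property documented at the end of the DataFrame stub above have no usage example of their own, so here is a minimal sketch of how they interact, assuming polars is installed at one of the stubbed versions (the column names are arbitrary):

import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": [3, 1, 2]})
# Declare that column "a" is already sorted, so later operations that can
# exploit sortedness (e.g. search_sorted, sorted joins) may skip a re-sort.
df = df.set_sorted("a")
# The flags property should now report the ascending-sorted flag for "a",
# roughly {"a": {"SORTED_ASC": True, "SORTED_DESC": False}, "b": {...}}.
print(df.flags)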
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/expr/expr deleted file mode 100644 index 48010f8..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/expr/expr +++ /dev/null @@ -1,265 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.exceptions import PolarsInefficientApplyWarning as PolarsInefficientApplyWarning, PolarsPanicError as PolarsPanicError -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... 
- def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self, drop_nulls: bool = ...) -> Self: ... - def all(self, drop_nulls: bool = ...) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def cbrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def keep_name(self) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... 
- def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: list[float], labels: list[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: ... - def qcut(self, q: list[float] | int, labels: list[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> Self: ... - def rle(self) -> Self: ... - def rle_id(self) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... 
- def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | None | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... 
- def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ..., fixed_seed: bool = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ..., fixed_seed: bool = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/expr/expr.pyi new file mode 100644 index 0000000..7739136 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/expr/expr.pyi @@ -0,0 +1,7702 @@ +#: version 0.18.11 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientApplyWarning as PolarsInefficientApplyWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... 
+ def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self, drop_nulls: bool = ...) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Parameters + ---------- + drop_nulls + If False, return None if there are nulls but no Trues. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + >>> df = pl.DataFrame(dict(x=[None, False], y=[None, True])) + >>> df.select(pl.col("x").any(True), pl.col("y").any(True)) + shape: (1, 2) + ┌───────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪══════╡ + │ false ┆ true │ + └───────┴──────┘ + >>> df.select(pl.col("x").any(False), pl.col("y").any(False)) + shape: (1, 2) + ┌──────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪══════╡ + │ null ┆ true │ + └──────┴──────┘ + + ''' + def all(self, drop_nulls: bool = ...) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Parameters + ---------- + drop_nulls + If False, return None if there are any nulls. + + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... 
) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + >>> df = pl.DataFrame(dict(x=[None, False], y=[None, True])) + >>> df.select(pl.col("x").all(True), pl.col("y").all(True)) + shape: (1, 2) + ┌───────┬───────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + └───────┴───────┘ + >>> df.select(pl.col("x").all(False), pl.col("y").all(False)) + shape: (1, 2) + ┌──────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪══════╡ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... ) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... 
) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().map_alias(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").keep_name()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. 
+ + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... 
) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... 
pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) 
-> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. 
+ + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. 
+ + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. 
+ + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: list[float], labels: list[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + A list of unique cut points. + labels + Labels to assign to bins. If given, the length must be len(breaks) + 1. 
+ left_closed
+ Whether intervals should be [) instead of the default of (]
+ include_breaks
+ Include the right endpoint of the bin each observation falls in.
+ If True, the resulting column will be a Struct.
+
+ Examples
+ --------
+ >>> g = pl.repeat("a", 5, eager=True).append(pl.repeat("b", 5, eager=True))
+ >>> df = pl.DataFrame(dict(g=g, x=range(10)))
+ >>> df.with_columns(q=pl.col("x").cut([2, 5]))
+ shape: (10, 3)
+ ┌─────┬─────┬───────────┐
+ │ g ┆ x ┆ q │
+ │ --- ┆ --- ┆ --- │
+ │ str ┆ i64 ┆ cat │
+ ╞═════╪═════╪═══════════╡
+ │ a ┆ 0 ┆ (-inf, 2] │
+ │ a ┆ 1 ┆ (-inf, 2] │
+ │ a ┆ 2 ┆ (-inf, 2] │
+ │ a ┆ 3 ┆ (2, 5] │
+ │ … ┆ … ┆ … │
+ │ b ┆ 6 ┆ (5, inf] │
+ │ b ┆ 7 ┆ (5, inf] │
+ │ b ┆ 8 ┆ (5, inf] │
+ │ b ┆ 9 ┆ (5, inf] │
+ └─────┴─────┴───────────┘
+ >>> df.with_columns(q=pl.col("x").cut([2, 5], left_closed=True))
+ shape: (10, 3)
+ ┌─────┬─────┬───────────┐
+ │ g ┆ x ┆ q │
+ │ --- ┆ --- ┆ --- │
+ │ str ┆ i64 ┆ cat │
+ ╞═════╪═════╪═══════════╡
+ │ a ┆ 0 ┆ [-inf, 2) │
+ │ a ┆ 1 ┆ [-inf, 2) │
+ │ a ┆ 2 ┆ [2, 5) │
+ │ a ┆ 3 ┆ [2, 5) │
+ │ … ┆ … ┆ … │
+ │ b ┆ 6 ┆ [5, inf) │
+ │ b ┆ 7 ┆ [5, inf) │
+ │ b ┆ 8 ┆ [5, inf) │
+ │ b ┆ 9 ┆ [5, inf) │
+ └─────┴─────┴───────────┘
+ '''
+ def qcut(self, *args, **kwargs) -> Self:
+ '''
+ Bin continuous values into discrete categories based on their quantiles.
+
+ Parameters
+ ----------
+ q
+ Either a list of quantile probabilities between 0 and 1 or a positive
+ integer determining the number of evenly spaced probabilities to use.
+ labels
+ Labels to assign to bins. If given, the length must be len(probs) + 1.
+ If computing over groups this must be set for now.
+ left_closed
+ Whether intervals should be [) instead of the default of (]
+ allow_duplicates
+ If True, the resulting quantile breaks don\'t have to be unique. This can
+ happen even with unique probs depending on the data. Duplicates will be
+ dropped, resulting in fewer bins.
+ include_breaks
+ Include the right endpoint of the bin each observation falls in.
+ If True, the resulting column will be a Struct.
+ + + Examples + -------- + >>> g = pl.repeat("a", 5, eager=True).append(pl.repeat("b", 5, eager=True)) + >>> df = pl.DataFrame(dict(g=g, x=range(10))) + >>> df.with_columns(q=pl.col("x").qcut([0.5])) + shape: (10, 3) + ┌─────┬─────┬─────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════════════╡ + │ a ┆ 0 ┆ (-inf, 4.5] │ + │ a ┆ 1 ┆ (-inf, 4.5] │ + │ a ┆ 2 ┆ (-inf, 4.5] │ + │ a ┆ 3 ┆ (-inf, 4.5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (4.5, inf] │ + │ b ┆ 7 ┆ (4.5, inf] │ + │ b ┆ 8 ┆ (4.5, inf] │ + │ b ┆ 9 ┆ (4.5, inf] │ + └─────┴─────┴─────────────┘ + >>> df.with_columns(q=pl.col("x").qcut(2)) + shape: (10, 3) + ┌─────┬─────┬─────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════════════╡ + │ a ┆ 0 ┆ (-inf, 4.5] │ + │ a ┆ 1 ┆ (-inf, 4.5] │ + │ a ┆ 2 ┆ (-inf, 4.5] │ + │ a ┆ 3 ┆ (-inf, 4.5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (4.5, inf] │ + │ b ┆ 7 ┆ (4.5, inf] │ + │ b ┆ 8 ┆ (4.5, inf] │ + │ b ┆ 9 ┆ (4.5, inf] │ + └─────┴─────┴─────────────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.5], ["lo", "hi"]).over("g")) + shape: (10, 3) + ┌─────┬─────┬─────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ lo │ + │ a ┆ 1 ┆ lo │ + │ a ┆ 2 ┆ lo │ + │ a ┆ 3 ┆ hi │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ lo │ + │ b ┆ 7 ┆ lo │ + │ b ┆ 8 ┆ hi │ + │ b ┆ 9 ┆ hi │ + └─────┴─────┴─────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.5], ["lo", "hi"], True).over("g")) + shape: (10, 3) + ┌─────┬─────┬─────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ lo │ + │ a ┆ 1 ┆ lo │ + │ a ┆ 2 ┆ hi │ + │ a ┆ 3 ┆ hi │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ lo │ + │ b ┆ 7 ┆ hi │ + │ b ┆ 8 ┆ hi │ + │ b ┆ 9 ┆ hi │ + └─────┴─────┴─────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.25, 0.5], include_breaks=True)) + shape: (10, 3) + ┌─────┬─────┬───────────────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ struct[2] │ + ╞═════╪═════╪═══════════════════════╡ + │ a ┆ 0 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 1 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 2 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 3 ┆ {4.5,"(2.25, 4.5]"} │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 7 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 8 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 9 ┆ {inf,"(4.5, inf]"} │ + └─────┴─────┴───────────────────────┘ + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! 
+ >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. 
+ Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. 
+ + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... 
(pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq_missing(pl.col("y")).alias("x == y"), + ... ) + shape: (6, 3) + ┌──────┬──────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞══════╪══════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + │ null ┆ 5.0 ┆ false │ + │ null ┆ null ┆ true │ + └──────┴──────┴────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... 
data={
+ ... "x": [5.0, 4.0, float("nan"), 0.5],
+ ... "y": [5.0, 3.5, float("nan"), 2.0],
+ ... }
+ ... )
+ >>> df.with_columns(
+ ... pl.col("x").le(pl.col("y")).alias("x <= y"),
+ ... )
+ shape: (4, 3)
+ ┌─────┬─────┬────────┐
+ │ x ┆ y ┆ x <= y │
+ │ --- ┆ --- ┆ --- │
+ │ f64 ┆ f64 ┆ bool │
+ ╞═════╪═════╪════════╡
+ │ 5.0 ┆ 5.0 ┆ true │
+ │ 4.0 ┆ 3.5 ┆ false │
+ │ NaN ┆ NaN ┆ false │
+ │ 0.5 ┆ 2.0 ┆ true │
+ └─────┴─────┴────────┘
+
+ '''
+ def lt(self, other: Any) -> Self:
+ '''
+ Method equivalent of "less than" operator ``expr < other``.
+
+ Parameters
+ ----------
+ other
+ A literal or expression value to compare with.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... data={
+ ... "x": [1.0, 2.0, float("nan"), 3.0],
+ ... "y": [2.0, 2.0, float("nan"), 4.0],
+ ... }
+ ... )
+ >>> df.with_columns(
+ ... pl.col("x").lt(pl.col("y")).alias("x < y"),
+ ... )
+ shape: (4, 3)
+ ┌─────┬─────┬───────┐
+ │ x ┆ y ┆ x < y │
+ │ --- ┆ --- ┆ --- │
+ │ f64 ┆ f64 ┆ bool │
+ ╞═════╪═════╪═══════╡
+ │ 1.0 ┆ 2.0 ┆ true │
+ │ 2.0 ┆ 2.0 ┆ false │
+ │ NaN ┆ NaN ┆ false │
+ │ 3.0 ┆ 4.0 ┆ true │
+ └─────┴─────┴───────┘
+
+ '''
+ def ne(self, other: Any) -> Self:
+ '''
+ Method equivalent of inequality operator ``expr != other``.
+
+ Parameters
+ ----------
+ other
+ A literal or expression value to compare with.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... data={
+ ... "x": [1.0, 2.0, float("nan"), 4.0],
+ ... "y": [2.0, 2.0, float("nan"), 4.0],
+ ... }
+ ... )
+ >>> df.with_columns(
+ ... pl.col("x").ne(pl.col("y")).alias("x != y"),
+ ... )
+ shape: (4, 3)
+ ┌─────┬─────┬────────┐
+ │ x ┆ y ┆ x != y │
+ │ --- ┆ --- ┆ --- │
+ │ f64 ┆ f64 ┆ bool │
+ ╞═════╪═════╪════════╡
+ │ 1.0 ┆ 2.0 ┆ true │
+ │ 2.0 ┆ 2.0 ┆ false │
+ │ NaN ┆ NaN ┆ true │
+ │ 4.0 ┆ 4.0 ┆ false │
+ └─────┴─────┴────────┘
+
+ '''
+ def ne_missing(self, other: Any) -> Self:
+ '''
+ Method equivalent of inequality operator ``expr != other`` where `None` == None`.
+
+ This differs from default ``ne`` where null values are propagated.
+
+ Parameters
+ ----------
+ other
+ A literal or expression value to compare with.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... data={
+ ... "x": [1.0, 2.0, float("nan"), 4.0, None, None],
+ ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None],
+ ... }
+ ... )
+ >>> df.with_columns(
+ ... pl.col("x").ne_missing(pl.col("y")).alias("x != y"),
+ ... )
+ shape: (6, 3)
+ ┌──────┬──────┬────────┐
+ │ x ┆ y ┆ x != y │
+ │ --- ┆ --- ┆ --- │
+ │ f64 ┆ f64 ┆ bool │
+ ╞══════╪══════╪════════╡
+ │ 1.0 ┆ 2.0 ┆ true │
+ │ 2.0 ┆ 2.0 ┆ false │
+ │ NaN ┆ NaN ┆ true │
+ │ 4.0 ┆ 4.0 ┆ false │
+ │ null ┆ 5.0 ┆ true │
+ │ null ┆ null ┆ false │
+ └──────┴──────┴────────┘
+
+ '''
+ def add(self, other: Any) -> Self:
+ '''
+ Method equivalent of addition operator ``expr + other``.
+
+ Parameters
+ ----------
+ other
+ numeric or string value; accepts expression input.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]})
+ >>> df.with_columns(
+ ... pl.col("x").add(2).alias("x+int"),
+ ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"),
+ ... )
+ shape: (5, 3)
+ ┌─────┬───────┬────────┐
+ │ x ┆ x+int ┆ x+expr │
+ │ --- ┆ --- ┆ --- │
+ │ i64 ┆ i64 ┆ i64 │
+ ╞═════╪═══════╪════════╡
+ │ 1 ┆ 3 ┆ 2 │
+ │ 2 ┆ 4 ┆ 4 │
+ │ 3 ┆ 5 ┆ 9 │
+ │ 4 ┆ 6 ┆ 28 │
+ │ 5 ┆ 7 ┆ 125 │
+ └─────┴───────┴────────┘
+
+ >>> df = pl.DataFrame(
+ ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]}
+ ...
) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... 
) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... 
) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. 
Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... 
).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, *args, **kwargs) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, *args, **kwargs) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, *args, **kwargs) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... 
window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, *args, **kwargs) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ...
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, *args, **kwargs) -> Self: + ''' + Compute a rolling standard deviation. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... 
rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, *args, **kwargs) -> Self: + ''' + Compute a rolling variance. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ``<t_0, t_1, ..., t_n>``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ...
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, *args, **kwargs) -> Self: + ''' + Compute a rolling median. 
+ + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, *args, **kwargs) -> Self: + ''' + Compute a rolling quantile. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... 
quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f32 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. 
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. 
+ + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ..., fixed_seed: bool = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + fixed_seed + If True, The seed will not be incremented between draws. + This can make output predictable because draw ordering can + change due to threads being scheduled in a different order. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. 
+ seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + fixed_seed + If True, The seed will not be incremented between draws. + This can make output predictable because draw ordering can + change due to threads being scheduled in a different order. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Expr + Expression of data type :class:`Struct`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... 
).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self, *args, **kwargs) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
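Note on the `_prepare_alpha` helper that closes the expr stub above: the decay conversions it normalises are exactly the formulas spelled out in the `ewm_std`/`ewm_var` docstrings earlier in this file. Below is a minimal sketch of those conversions for reference; the function name, error handling, and structure are illustrative and are not polars' actual implementation.

```python
from __future__ import annotations

import math


def decay_to_alpha(
    com: float | None = None,
    span: float | None = None,
    half_life: float | None = None,
    alpha: float | None = None,
) -> float:
    """Convert one EWM decay specification into a smoothing factor alpha.

    Exactly one of the four parameters may be given, matching the formulas
    in the ewm_* docstrings above. (Hypothetical helper, for illustration.)
    """
    given = [p for p in (com, span, half_life, alpha) if p is not None]
    if len(given) != 1:
        raise ValueError("pass exactly one of com, span, half_life, alpha")
    if com is not None:  # alpha = 1 / (1 + gamma), gamma >= 0
        return 1.0 / (1.0 + com)
    if span is not None:  # alpha = 2 / (theta + 1), theta >= 1
        return 2.0 / (span + 1.0)
    if half_life is not None:  # alpha = 1 - exp(-ln(2) / lambda), lambda > 0
        return 1.0 - math.exp(-math.log(2.0) / half_life)
    return float(alpha)  # alpha passed directly, 0 < alpha <= 1
```

For example, `decay_to_alpha(com=1)` yields 0.5, the smoothing factor behind the `ewm_std(com=1)` and `ewm_var(com=1)` doctest outputs shown above.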
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/lazyframe/frame deleted file mode 100644 index 1032b72..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/lazyframe/frame +++ /dev/null @@ -1,129 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath -from typing import Any, Callable, ClassVar, Collection, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... 
- @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | Path | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... 
- def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: IntoExpr) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... 
- def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... 
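The removed `frame` stub above lists the LazyFrame surface (scan constructors, `groupby*`, `sink_*`, `collect`, and so on) as bare signatures; the regenerated `frame.pyi` that follows carries the same surface plus docstrings. As a quick orientation, here is a minimal usage sketch exercising a few of those methods. It is not part of the generated stubs and assumes the polars ~0.18 naming that these stubs describe (`groupby`, `with_row_count`), which later polars releases renamed.

```python
import polars as pl

# Build a small LazyFrame and exercise a few of the methods whose
# signatures appear in the stub above (polars ~0.18 API names assumed).
lf = pl.LazyFrame({"grp": ["a", "b", "a", "b"], "val": [1, 2, 3, 4]})

out = (
    lf.with_row_count()                        # adds a u32 "row_nr" column
    .filter(pl.col("val") > 1)                 # lazy predicate, pushed down
    .groupby("grp", maintain_order=True)
    .agg(pl.col("val").sum().alias("val_sum"))
    .sort("grp")
    .collect()                                 # materialise the query plan
)
print(out)
```

Because the whole pipeline stays lazy until `collect`, a type checker reading the stub resolves every intermediate step to `LazyFrame` and only the final call to `DataFrame`, which is what the signatures in both the old and the regenerated stub encode.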
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..03446ac --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/lazyframe/frame.pyi @@ -0,0 +1,3442 @@ +#: version 0.18.11 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. 
+ + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self, *args, **kwargs) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. 
+ comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self, *args, **kwargs) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. 
+ + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self, *args, **kwargs) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self, *args, **kwargs) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... 
) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, *args, **kwargs) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. 
This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: IntoExpr) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. 
+ Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. 
Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.groupby_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_rolling + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... 
start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... 
).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. 
+ + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. 
+ how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. 
+ + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. 
+ + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. 
+ strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. 
+ + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... 
) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. 
+ + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... 
).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
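The ``update`` docstring in the stub above describes the operation as syntactic sugar for a left/inner join followed by a coalesce. Below is a minimal sketch of that equivalence, reusing the ``df``/``new_df`` data from the docstring example. This is an illustrative reconstruction, not the library's actual implementation; it assumes the documented default of joining on the row count when ``on`` is not given.

.. code-block:: python

    import polars as pl

    df = pl.DataFrame({"A": [1, 2, 3, 4], "B": [400, 500, 600, 700]})
    new_df = pl.DataFrame({"B": [4, None, 6], "C": [7, 8, 9]})

    # What update() produces: non-null values from new_df overwrite df's values.
    updated = df.update(new_df)

    # Rough hand-rolled equivalent (a sketch, not polars' internal code):
    # join on the row count, then prefer the right-hand ("_new") value
    # wherever it is non-null.
    manual = (
        df.with_row_count("row_nr")
        .join(new_df.with_row_count("row_nr"), on="row_nr", how="left", suffix="_new")
        .sort("row_nr")
        .with_columns(pl.coalesce([pl.col("B_new"), pl.col("B")]).alias("B"))
        .select(["A", "B"])
    )

    assert updated.frame_equal(manual)

Both frames come out as the ``A``/``B`` result shown in the docstring example (``1/4``, ``2/500``, ``3/6``, ``4/700``), which is what the "left join + coalesce" phrasing refers to.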
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/series/series deleted file mode 100644 index ef43da3..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/series/series +++ /dev/null @@ -1,345 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.deprecation import deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as 
issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... 
- @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - @overload - def __floordiv__(self, other: Expr) -> Expr: ... - @overload - def __floordiv__(self, other: Any) -> Series: ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def cbrt(self) -> Series: ... - def any(self, drop_nulls: bool = ...) -> bool | None: ... - def all(self, drop_nulls: bool = ...) -> bool | None: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | None | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... 
- def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, breaks: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, series: bool = ..., left_closed: bool = ..., include_breaks: bool = ...) -> DataFrame | Series: ... - def qcut(self, q: list[float] | int, *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., series: bool = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> DataFrame | Series: ... - def rle(self) -> Series: ... - def rle_id(self) -> Series: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool | None = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool | None = ...) -> Self: ... - def extend(self, other: Series) -> Self: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... 
- def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... 
- def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... 
- @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/series/series.pyi new file mode 100644 index 0000000..63f6b8a --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.11/polars/series/series.pyi @@ -0,0 +1,4383 @@ +#: version 0.18.11 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.deprecation import deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool 
+INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... 
+ def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self, drop_nulls: bool = ...) -> bool | None: + """ + Check if any boolean value in the column is `True`. 
+ + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def all(self, drop_nulls: bool = ...) -> bool | None: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. 
+ + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, *args, **kwargs) -> DataFrame | Series: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + A list of unique cut points. + labels + Labels to assign to the bins. If given the length of labels must be + len(breaks) + 1. + break_point_label + Name given to the breakpoint column/field. Only used if series == False or + include_breaks == True + category_label + Name given to the category column. Only used if series == False + series + If True, return a categorical Series in the data\'s original order. 
+ left_closed + Whether intervals should be [) instead of (] + include_breaks + Include the the right endpoint of the bin each observation falls in. + If returning a DataFrame, it will be a column, and if returning a Series + it will be a field in a Struct + + Returns + ------- + DataFrame or Series + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1], series=False) + shape: (12, 3) + ┌──────┬─────────────┬────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1, 1] │ + │ 1.5 ┆ inf ┆ (1, inf] │ + │ 2.0 ┆ inf ┆ (1, inf] │ + │ 2.5 ┆ inf ┆ (1, inf] │ + └──────┴─────────────┴────────────┘ + >>> a.cut([-1, 1], series=True) + shape: (12,) + Series: \'a\' [cat] + [ + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-1, 1]" + "(-1, 1]" + "(-1, 1]" + "(-1, 1]" + "(1, inf]" + "(1, inf]" + "(1, inf]" + ] + >>> a.cut([-1, 1], series=True, left_closed=True) + shape: (12,) + Series: \'a\' [cat] + [ + "[-inf, -1)" + "[-inf, -1)" + "[-inf, -1)" + "[-inf, -1)" + "[-1, 1)" + "[-1, 1)" + "[-1, 1)" + "[-1, 1)" + "[1, inf)" + "[1, inf)" + "[1, inf)" + "[1, inf)" + ] + ''' + def qcut(self, *args, **kwargs) -> DataFrame | Series: + ''' + Discretize continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + q + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of evenly spaced probabilities to use. + labels + Labels to assign to the quantiles. If given the length of labels must be + len(breaks) + 1. + break_point_label + Name given to the breakpoint column/field. Only used if series == False or + include_breaks == True + category_label + Name given to the category column. Only used if series == False. + series + If True, return a categorical Series in the data\'s original order + left_closed + Whether intervals should be [) instead of (] + allow_duplicates + If True, the resulting quantile breaks don\'t have to be unique. This can + happen even with unique probs depending on the data. Duplicates will be + dropped, resulting in fewer bins. + include_breaks + Include the the right endpoint of the bin each observation falls in. + If returning a DataFrame, it will be a column, and if returning a Series + it will be a field in a Struct + + Returns + ------- + DataFrame or Series + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut(2, series=True) + shape: (8,) + Series: \'a\' [cat] + [ + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-1.5, inf]" + "(-1.5, inf]" + "(-1.5, inf]" + "(-1.5, inf]" + ] + >>> a.qcut([0.0, 0.25, 0.75], series=False) + shape: (8, 3) + ┌─────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪═══════════════╡ + │ -5 ┆ -5.0 ┆ (-inf, -5] │ + │ -4 ┆ -3.25 ┆ (-5, -3.25] │ + │ -3 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1 ┆ inf ┆ (0.25, inf] │ + │ 2 ┆ inf ┆ (0.25, inf] │ + └─────┴─────────────┴───────────────┘ + >>> a.qcut([0.0, 0.25, 0.75], series=True) + shape: (8,) + Series: \'a\' [cat] + [ + "(-inf, -5]" + "(-5, -3.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(0.25, inf]" + "(0.25, inf]" + ] + >>> a.qcut([0.0, 0.25, 0.75], series=True, left_closed=True) + shape: (8,) + Series: \'a\' [cat] + [ + "[-5, -3.25)" + "[-5, -3.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[0.25, inf)" + "[0.25, inf)" + ] + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and ``append`` will change to always + behave like ``append_chunks=True`` (the previous default). For the + behavior of ``append_chunks=False``, use ``Series.extend``. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. 
+ + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from ``append``, which adds the chunks from ``other`` to the chunks of + this series, ``extend`` appends the data from ``other`` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``append`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer ``append`` over ``extend`` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single ``Series``. In the latter case, finish the sequence + of ``append`` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. 
+ + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. 
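# --- Illustrative usage sketch (not part of the generated stub) ---
# The boolean mask from `is_between` can be fed straight into `filter` to keep
# only the in-range values; `closed` controls which bounds are inclusive, as
# documented above. Assumes a local polars install.
import polars as pl

s = pl.Series("num", [1, 2, 3, 4, 5])
print(s.filter(s.is_between(2, 4)))                 # [2, 3, 4]
print(s.filter(s.is_between(2, 4, closed="left")))  # [2, 3]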
+ + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point ``nan`` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set ``zero_copy_only=True``. + + Alternatively, if you want a zero-copy view and know what you are doing, + use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. 
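# --- Illustrative usage sketch (not part of the generated stub) ---
# The zero-copy rules for `to_numpy` described above: purely numeric data
# without nulls can be viewed without copying, while nulls force a copy (or an
# error when `zero_copy_only=True`). A hedged sketch assuming polars, numpy
# and pyarrow are installed.
import polars as pl

dense = pl.Series("a", [1.0, 2.0, 3.0])
arr = dense.to_numpy(zero_copy_only=True)  # no nulls, numeric -> zero copy ok
print(arr)

sparse = pl.Series("b", [1.0, None, 3.0])
try:
    sparse.to_numpy(zero_copy_only=True)   # nulls require a copy, expected to raise
except Exception as exc:
    print(f"zero-copy not possible: {exc}")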
+ Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + Series + The mutated series. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
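# --- Illustrative usage sketch (not part of the generated stub) ---
# `set`/`set_at_idx` mutate values eagerly, while the when/then/otherwise form
# recommended in the notes above keeps the update inside the expression
# engine. Assumes a local polars install.
import polars as pl

s = pl.Series("a", [1, 2, 3])
print(s.set_at_idx(1, 10))  # returns the mutated series: [1, 10, 3]

# Expression-based alternative from the notes above, on a fresh series so the
# mutation just performed does not interfere:
t = pl.Series("a", [1, 2, 3])
out = (
    t.to_frame()
    .with_row_count("row_nr")
    .select(pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")))
)
print(out)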
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). 
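# --- Illustrative usage sketch (not part of the generated stub) ---
# `fill_null` and `fill_nan` are separate operations: a float NaN is not a
# null, so each helper only touches its own kind of missing value. A hedged
# sketch assuming a local polars install.
import polars as pl

s = pl.Series("a", [1.0, None, float("nan")])
print(s.fill_null(0.0))                  # only the null slot becomes 0.0
print(s.fill_nan(0.0))                   # only the NaN slot becomes 0.0
print(s.fill_nan(None).fill_null(0.0))   # treat NaN as missing, then fill both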
+ + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. 
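# --- Illustrative usage sketch (not part of the generated stub) ---
# The warning on `apply` above: prefer native expressions over Python UDFs.
# Both lines below are intended to produce the same values, but the second one
# stays inside the Rust expression engine. Assumes a local polars install.
import polars as pl

s = pl.Series("a", [1, 2, 3])

udf = s.apply(lambda x: x + 10)                              # Python UDF: slow path
native = s.to_frame().select(pl.col("a") + 10).to_series()   # expression engine
print(udf.series_equal(native))                              # expected: True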
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. 
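# --- Illustrative usage sketch (not part of the generated stub) ---
# As described above, the rolling window at a row covers that row plus the
# `window_size - 1` rows before it, so the first windows are incomplete.
# `min_periods` controls when a value is emitted instead of null.
import polars as pl

s = pl.Series("a", [100, 200, 300, 400, 500])
print(s.rolling_mean(window_size=3))                  # two leading nulls
print(s.rolling_mean(window_size=3, min_periods=1))   # partial windows allowed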
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. 
+ + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. 
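# --- Illustrative usage sketch (not part of the generated stub) ---
# `rolling_apply` takes an arbitrary Python callable, but the note above
# recommends the specialised rolling functions where one exists. On fully
# non-null input the two forms below should agree; treat this as a hedged
# sketch, not a guarantee, and assume numpy/polars are installed.
import numpy as np
import polars as pl

s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0])
custom = s.rolling_apply(function=np.median, window_size=3)  # Python callable per window
builtin = s.rolling_median(window_size=3)                    # specialised fast path
print(custom.series_equal(builtin))                          # expected: True on this input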
+ + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. 
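# --- Illustrative usage sketch (not part of the generated stub) ---
# The boolean masks from `peak_max`/`peak_min` combine naturally with `filter`
# to extract the peak values themselves rather than a mask; inputs reuse the
# docstring examples above.
import polars as pl

s1 = pl.Series("a", [1, 2, 3, 4, 5])
print(s1.filter(s1.peak_max()))   # [5]

s2 = pl.Series("a", [4, 1, 3, 2, 5])
print(s2.filter(s2.peak_min()))   # [1, 2]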
+ + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. 
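# --- Illustrative usage sketch (not part of the generated stub) ---
# For fully non-null input, `pct_change(n)` matches the shift-based definition
# given in its docstring above. A hedged sketch assuming a local polars install.
import polars as pl

s = pl.Series("x", [1.0, 2.0, 4.0, 8.0])
print(s.pct_change(1))                 # [null, 1.0, 1.0, 1.0]
print((s - s.shift(1)) / s.shift(1))   # same values, written out by hand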
+ + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. 
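# --- Illustrative usage sketch (not part of the generated stub) ---
# `clip_min` and `clip_max` are the one-sided versions of `clip`, so chaining
# them should match a single two-sided clip on numeric data (nulls pass
# through unchanged). Assumes a local polars install.
import polars as pl

s = pl.Series("foo", [-50, 5, None, 50])
print(s.clip(1, 10))                # [1, 5, null, 10]
print(s.clip_min(1).clip_max(10))   # same result, built from the one-sided ops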
+ + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. 
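# --- Illustrative usage sketch (not part of the generated stub) ---
# The decay parameters of the ewm_* methods above are alternative spellings of
# the same alpha: from the formula given, com=1 corresponds to
# alpha = 1 / (1 + 1) = 0.5. A hedged sketch assuming a local polars install.
import polars as pl

s = pl.Series("a", [1, 2, 3])
print(s.ewm_mean(com=1))       # smoothing expressed via center of mass ...
print(s.ewm_mean(alpha=0.5))   # ... or directly as alpha; expected to match
print(s.ewm_std(com=1))        # compare with the ewm_std example above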
+ + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/dataframe/frame deleted file mode 100644 index 6875270..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/dataframe/frame +++ /dev/null @@ -1,283 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase, TextIOWrapper -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Null as Null, Object as Object, Struct as Struct, Time as Time, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name 
as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SelectorType as SelectorType, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_renamed_methods as deprecate_renamed_methods, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... 
- @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ..., ignore_errors: bool = ...) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... 
- @property - def flags(self) -> dict[str, dict[str, bool]]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... 
- def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ..., freeze_panes: str | tuple[int, int] | tuple[str, int, int] | tuple[int, int, int, int] | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, Any] | None = ...) -> None: ... - def write_database(self, table_name: str, connection: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... 
- def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: str | Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) 
-> DataFrame: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, other: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: DataFrame) -> Self: ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... 
- @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - def rows_by_key(self, key: str | Sequence[str] | SelectorType, *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs: Any) -> DataFrame: ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
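The replacement stub added in the next hunk differs from the removed file in two visible ways: it uses the `.pyi` suffix, and its first line is a `#: version 0.18.12` banner identifying the polars release it was generated against. As a rough illustration only (not part of this change set), that banner could be read back out of a stub with a few lines of Python; the `read_stub_version` helper below is a hypothetical name for the sketch:

    import re
    from pathlib import Path
    from typing import Optional

    def read_stub_version(stub_path: Path) -> Optional[str]:
        # Hypothetical helper: the regenerated stubs open with a banner
        # line such as "#: version 0.18.12"; pull that version back out.
        first_line = stub_path.read_text().splitlines()[0]
        match = re.match(r"#: version (\S+)", first_line)
        return match.group(1) if match else None

    # Example, using the path added in this diff:
    # read_stub_version(Path("polugins_type_gen/src/polugins_type_gen/_stubs/"
    #                        "0.18.12/polars/dataframe/frame.pyi"))  # -> "0.18.12"
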
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/dataframe/frame.pyi new file mode 100644 index 0000000..dce78d4 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/dataframe/frame.pyi @@ -0,0 +1,6060 @@ +#: version 0.18.12 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Null as Null, Object as Object, Struct as Struct, Time as Time, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_renamed_methods as deprecate_renamed_methods, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, 
Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. 
+ schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. 
+ nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + ``nan_as_null`` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + Polars currently relies on pyarrow's implementation of the dataframe interchange + protocol. Therefore, pyarrow>=11.0.0 is required for this method to work. + + Because Polars can not currently guarantee zero-copy conversion to Arrow for + categorical columns, ``allow_copy=False`` will not work if the dataframe + contains categorical data. + + """ + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. 
+ + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. 
+ + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. 
Note that this option only takes effect if + ``structured`` is set to ``False`` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... 
pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. 
+ + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. 
+ + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. 
+ + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... 
# basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... 
# use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + If you pass ``partition_cols`` here, the dataset will be written + using ``pyarrow.parquet.write_to_dataset``. + The ``partition_cols`` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, *args, **kwargs) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. 
+ connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Note: Some polars data types like `Null`, `Categorical` and `Time` are + not supported by the delta protocol specification. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Examples + -------- + Instantiate a basic dataframe: + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + + Write DataFrame as a Delta Lake table on local filesystem. + + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on local filesystem. + Note: This will fail if schema of the new data does not match the + schema of existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + Note: If the schema of the new and old data is same, + then setting `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table on cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. 
+ + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... 
+ >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... 
) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. 
+ + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... 
) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. 
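+
+        A minimal sketch of the note above (``df_order``, ``g`` and ``v`` are
+        illustrative names only): even without ``maintain_order``, the order of
+        rows *within* each group is preserved, so a list aggregation always
+        reflects input order; only the order of the groups themselves may vary
+        between runs.
+
+        >>> df_order = pl.DataFrame({"g": ["x", "y", "x"], "v": [1, 2, 3]})
+        >>> df_order.groupby("g").agg(pl.col("v"))  # doctest: +IGNORE_RESULT
+        >>> # "x" always aggregates to [1, 3] and "y" to [2], in input order.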
+ + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. 
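+
+        A minimal sketch of the duration-string language above, combining units
+        for the window length (``df_roll`` is an illustrative name; the data
+        mirrors the Examples further below):
+
+        >>> df_roll = pl.DataFrame(
+        ...     {"dt": ["2020-01-01 13:45:48", "2020-01-05 16:42:13"], "a": [3, 7]}
+        ... ).with_columns(pl.col("dt").str.strptime(pl.Datetime).set_sorted())
+        >>> df_roll.groupby_rolling(index_column="dt", period="3d12h").agg(
+        ...     pl.sum("a").alias("sum_a")
+        ... )  # doctest: +IGNORE_RESULT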
+ + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. 
+ + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. 
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... 
) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... 
) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. 
+ + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). 
+ suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. 
If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, *args, **kwargs) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from ``vstack`` which adds the chunks from ``other`` to the chunks of + this ``DataFrame``, ``extend`` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. 
+ + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``vstack`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer ``vstack`` over ``extend`` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single ``DataFrame``. In the latter case, finish the sequence of + ``vstack`` operations with a ``rechunk``. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. + + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... 
) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. 
+ - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot( + ... values="baz", index="foo", columns="bar", aggregate_function="first" + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... ) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. 
+ how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.int_range(0, 9, eager=True), + ... } + ... ) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... 
) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. 
+ + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using ``iter_rows`` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + ''' + def rows_by_key(self, key: str | Sequence[str] | SelectorType) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. 
+ + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. 
+ + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... 
) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy ``corrcoef``. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. 
+ + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/expr/expr deleted file mode 100644 index ddba7d1..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/expr/expr +++ /dev/null @@ -1,265 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.exceptions import PolarsInefficientApplyWarning as PolarsInefficientApplyWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... 
- def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self, drop_nulls: bool = ...) -> Self: ... - def all(self, drop_nulls: bool = ...) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def cbrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def keep_name(self) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... 
- def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: list[float], labels: list[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: ... - def qcut(self, quantiles: list[float] | int, labels: list[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> Self: ... - def rle(self) -> Self: ... - def rle_id(self) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... 
- def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | None | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... 
- def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ..., fixed_seed: bool = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ..., fixed_seed: bool = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/expr/expr.pyi new file mode 100644 index 0000000..f34b6e5 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/expr/expr.pyi @@ -0,0 +1,7701 @@ +#: version 0.18.12 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientApplyWarning as PolarsInefficientApplyWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... 
+ def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self, drop_nulls: bool = ...) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Parameters + ---------- + drop_nulls + If False, return None if there are nulls but no Trues. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + >>> df = pl.DataFrame(dict(x=[None, False], y=[None, True])) + >>> df.select(pl.col("x").any(True), pl.col("y").any(True)) + shape: (1, 2) + ┌───────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪══════╡ + │ false ┆ true │ + └───────┴──────┘ + >>> df.select(pl.col("x").any(False), pl.col("y").any(False)) + shape: (1, 2) + ┌──────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪══════╡ + │ null ┆ true │ + └──────┴──────┘ + + ''' + def all(self, drop_nulls: bool = ...) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Parameters + ---------- + drop_nulls + If False, return None if there are any nulls. + + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... 
) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + >>> df = pl.DataFrame(dict(x=[None, False], y=[None, True])) + >>> df.select(pl.col("x").all(True), pl.col("y").all(True)) + shape: (1, 2) + ┌───────┬───────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + └───────┴───────┘ + >>> df.select(pl.col("x").all(False), pl.col("y").all(False)) + shape: (1, 2) + ┌──────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪══════╡ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... ) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... 
) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().map_alias(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").keep_name()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. 
+ + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... 
) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... 
pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) 
-> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. 
+ + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. 
+ + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. 
+ + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: list[float], labels: list[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + A list of unique cut points. + labels + Labels to assign to bins. If given, the length must be len(breaks) + 1. 
+ left_closed + Whether intervals should be [) instead of the default of (] + include_breaks + Include the the right endpoint of the bin each observation falls in. + If True, the resulting column will be a Struct. + + Examples + -------- + >>> g = pl.repeat("a", 5, eager=True).append(pl.repeat("b", 5, eager=True)) + >>> df = pl.DataFrame(dict(g=g, x=range(10))) + >>> df.with_columns(q=pl.col("x").cut([2, 5])) + shape: (10, 3) + ┌─────┬─────┬───────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═══════════╡ + │ a ┆ 0 ┆ (-inf, 2] │ + │ a ┆ 1 ┆ (-inf, 2] │ + │ a ┆ 2 ┆ (-inf, 2] │ + │ a ┆ 3 ┆ (2, 5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (5, inf] │ + │ b ┆ 7 ┆ (5, inf] │ + │ b ┆ 8 ┆ (5, inf] │ + │ b ┆ 9 ┆ (5, inf] │ + └─────┴─────┴───────────┘ + >>> df.with_columns(q=pl.col("x").cut([2, 5], left_closed=True)) + shape: (10, 3) + ┌─────┬─────┬───────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═══════════╡ + │ a ┆ 0 ┆ [-inf, 2) │ + │ a ┆ 1 ┆ [-inf, 2) │ + │ a ┆ 2 ┆ [2, 5) │ + │ a ┆ 3 ┆ [2, 5) │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ [5, inf) │ + │ b ┆ 7 ┆ [5, inf) │ + │ b ┆ 8 ┆ [5, inf) │ + │ b ┆ 9 ┆ [5, inf) │ + └─────┴─────┴───────────┘ + ''' + def qcut(self, *args, **kwargs) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of evenly spaced probabilities to use. + labels + Labels to assign to bins. If given, the length must be len(probs) + 1. + If computing over groups this must be set for now. + left_closed + Whether intervals should be [) instead of the default of (] + allow_duplicates + If True, the resulting quantile breaks don\'t have to be unique. This can + happen even with unique probs depending on the data. Duplicates will be + dropped, resulting in fewer bins. + include_breaks + Include the the right endpoint of the bin each observation falls in. + If True, the resulting column will be a Struct. 
+ + Examples + -------- + >>> g = pl.repeat("a", 5, eager=True).append(pl.repeat("b", 5, eager=True)) + >>> df = pl.DataFrame(dict(g=g, x=range(10))) + >>> df.with_columns(q=pl.col("x").qcut([0.5])) + shape: (10, 3) + ┌─────┬─────┬─────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════════════╡ + │ a ┆ 0 ┆ (-inf, 4.5] │ + │ a ┆ 1 ┆ (-inf, 4.5] │ + │ a ┆ 2 ┆ (-inf, 4.5] │ + │ a ┆ 3 ┆ (-inf, 4.5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (4.5, inf] │ + │ b ┆ 7 ┆ (4.5, inf] │ + │ b ┆ 8 ┆ (4.5, inf] │ + │ b ┆ 9 ┆ (4.5, inf] │ + └─────┴─────┴─────────────┘ + >>> df.with_columns(q=pl.col("x").qcut(2)) + shape: (10, 3) + ┌─────┬─────┬─────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════════════╡ + │ a ┆ 0 ┆ (-inf, 4.5] │ + │ a ┆ 1 ┆ (-inf, 4.5] │ + │ a ┆ 2 ┆ (-inf, 4.5] │ + │ a ┆ 3 ┆ (-inf, 4.5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (4.5, inf] │ + │ b ┆ 7 ┆ (4.5, inf] │ + │ b ┆ 8 ┆ (4.5, inf] │ + │ b ┆ 9 ┆ (4.5, inf] │ + └─────┴─────┴─────────────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.5], ["lo", "hi"]).over("g")) + shape: (10, 3) + ┌─────┬─────┬─────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ lo │ + │ a ┆ 1 ┆ lo │ + │ a ┆ 2 ┆ lo │ + │ a ┆ 3 ┆ hi │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ lo │ + │ b ┆ 7 ┆ lo │ + │ b ┆ 8 ┆ hi │ + │ b ┆ 9 ┆ hi │ + └─────┴─────┴─────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.5], ["lo", "hi"], True).over("g")) + shape: (10, 3) + ┌─────┬─────┬─────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ lo │ + │ a ┆ 1 ┆ lo │ + │ a ┆ 2 ┆ hi │ + │ a ┆ 3 ┆ hi │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ lo │ + │ b ┆ 7 ┆ hi │ + │ b ┆ 8 ┆ hi │ + │ b ┆ 9 ┆ hi │ + └─────┴─────┴─────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.25, 0.5], include_breaks=True)) + shape: (10, 3) + ┌─────┬─────┬───────────────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ struct[2] │ + ╞═════╪═════╪═══════════════════════╡ + │ a ┆ 0 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 1 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 2 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 3 ┆ {4.5,"(2.25, 4.5]"} │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 7 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 8 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 9 ┆ {inf,"(4.5, inf]"} │ + └─────┴─────┴───────────────────────┘ + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! 
+ >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. 
+ Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. 
+ + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... 
(pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq_missing(pl.col("y")).alias("x == y"), + ... ) + shape: (6, 3) + ┌──────┬──────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞══════╪══════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + │ null ┆ 5.0 ┆ false │ + │ null ┆ null ┆ true │ + └──────┴──────┴────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... 
data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne_missing(pl.col("y")).alias("x != y"), + ... ) + shape: (6, 3) + ┌──────┬──────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞══════╪══════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + │ null ┆ 5.0 ┆ true │ + │ null ┆ null ┆ false │ + └──────┴──────┴────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... 
) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... 
) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... 
) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. 
Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... 
).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, *args, **kwargs) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, *args, **kwargs) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, *args, **kwargs) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... 
window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, *args, **kwargs) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, *args, **kwargs) -> Self: + ''' + Compute a rolling standard deviation. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... 
rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, *args, **kwargs) -> Self: + ''' + Compute a rolling variance. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, *args, **kwargs) -> Self: + ''' + Compute a rolling median. 
+ + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, *args, **kwargs) -> Self: + ''' + Compute a rolling quantile. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... 
quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f32 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. 
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. 
+ + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ..., fixed_seed: bool = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + fixed_seed + If True, The seed will not be incremented between draws. + This can make output predictable because draw ordering can + change due to threads being scheduled in a different order. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. 
+ seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + fixed_seed + If True, The seed will not be incremented between draws. + This can make output predictable because draw ordering can + change due to threads being scheduled in a different order. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Expr + Expression of data type :class:`Struct`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... 
).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self, *args, **kwargs) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
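The ewm_mean, ewm_std and ewm_var docstrings in the stub above all express their decay parameters (com, span, half_life) in terms of a single smoothing factor alpha. As a reading aid, the following is a minimal sketch of those documented conversions; it is not part of the generated stubs or of polars itself, and the helper name ewm_alpha is invented purely for illustration.

    # Illustrative only: converts the EWM decay parameters documented above
    # into the smoothing factor alpha, following the formulas quoted in the
    # ewm_* docstrings. Not the polars implementation.
    import math

    def ewm_alpha(com=None, span=None, half_life=None, alpha=None):
        given = [v is not None for v in (com, span, half_life, alpha)]
        if sum(given) != 1:
            raise ValueError("specify exactly one of com, span, half_life, alpha")
        if com is not None:        # alpha = 1 / (1 + com), com >= 0
            return 1.0 / (1.0 + com)
        if span is not None:       # alpha = 2 / (span + 1), span >= 1
            return 2.0 / (span + 1.0)
        if half_life is not None:  # alpha = 1 - exp(-ln(2) / half_life), half_life > 0
            return 1.0 - math.exp(-math.log(2.0) / half_life)
        return alpha               # 0 < alpha <= 1

    # e.g. ewm_alpha(com=1) == 0.5, which is the smoothing factor behind the
    # ewm_mean(com=1) example above: [1.0, 1.666667, 2.428571] for input [1, 2, 3].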
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/lazyframe/frame deleted file mode 100644 index dd2e518..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/lazyframe/frame +++ /dev/null @@ -1,137 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat, subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_renamed_methods as deprecate_renamed_methods, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath -from typing import Any, Callable, ClassVar, Collection, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... 
- @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... - def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str: ... 
- def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | Path | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: IntoExpr) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... 
- def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... 
- def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..751a40f --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/lazyframe/frame.pyi @@ -0,0 +1,3458 @@ +#: version 0.18.12 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_renamed_methods as deprecate_renamed_methods, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... 
"b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self, *args, **kwargs) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self, *args, **kwargs) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. 
Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. 
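# Editor's sketch (not part of the generated stub): a minimal, hedged example of the
# sort/top_k signatures documented above. Column names and values are invented.
import polars as pl

lf = pl.LazyFrame({"a": [1, 2, None], "b": [6.0, 5.0, 4.0]})

# Sort by an expression and push nulls to the end, as described in the sort docstring.
sorted_lf = lf.sort(pl.col("a") + pl.col("b"), nulls_last=True)

# Keep only the 2 rows with the largest values in column "b".
largest_two = lf.top_k(2, by="b")

print(sorted_lf.collect(), largest_two.collect())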
+ by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self, *args, **kwargs) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self, *args, **kwargs) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. 
+ projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. 
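# Editor's sketch (not part of the generated stub): collect() versus the sink_* methods
# documented above. The CSV path and the "value" column are placeholders, not from the stub.
import polars as pl

lf = pl.scan_csv("data/large_input.csv").filter(pl.col("value") > 0)

# Materialize in memory; streaming=True runs (parts of) the plan in streaming mode.
df = lf.collect(streaming=True)

# Or stream the result straight to disk without materializing it first.
lf.sink_parquet("data/output.parquet", compression="zstd")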
+ predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, *args, **kwargs) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... 
) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: IntoExpr) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... 
threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... 
+ * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.groupby_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
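# Editor's sketch (not part of the generated stub): the interval/period strings used by
# groupby_rolling and groupby_dynamic follow the duration language listed above
# ("1h", "2d", "3d12h4m25s", "1i", ...). The data below is invented for illustration.
import polars as pl
from datetime import datetime

lf = pl.LazyFrame(
    {
        "dt": [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 5)],
        "a": [3, 7, 5],
    }
).set_sorted("dt")

out = (
    lf.groupby_rolling(index_column="dt", period="2d")
    .agg(pl.sum("a").alias("sum_a"))
    .collect()
)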
+ by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_rolling + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... 
"time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... 
).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. 
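# Editor's sketch (not part of the generated stub): using the `tolerance` argument of
# join_asof described above. The frames, column names and 5-second window are invented.
import polars as pl
from datetime import datetime

trades = pl.LazyFrame(
    {"time": [datetime(2023, 1, 1, 9, 0, 3)], "qty": [100]}
).set_sorted("time")
quotes = pl.LazyFrame(
    {"time": [datetime(2023, 1, 1, 9, 0, 0)], "price": [1.5]}
).set_sorted("time")

# Match the most recent quote, but only if it is at most 5 seconds older than the trade.
joined = trades.join_asof(quotes, on="time", strategy="backward", tolerance="5s").collect()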
+ force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. 
+ + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... 
).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. 
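# Editor's sketch (not part of the generated stub): chaining with_row_count and take_every
# as documented above. The data is illustrative only.
import polars as pl

lf = pl.LazyFrame({"a": [10, 20, 30, 40, 50]})

# Add a row counter, then keep every 2nd row of the result.
out = lf.with_row_count(name="idx").take_every(2).collect()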
+ strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. 
+ + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... 
) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. 
+ + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... 
).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/series/series deleted file mode 100644 index f6e2e59..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/series/series +++ /dev/null @@ -1,346 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.deprecation import deprecate_renamed_parameter as 
deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... 
- @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - @overload - def __floordiv__(self, other: Expr) -> Expr: ... - @overload - def __floordiv__(self, other: Any) -> Series: ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def __column_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def cbrt(self) -> Series: ... - def any(self, drop_nulls: bool = ...) -> bool | None: ... - def all(self, drop_nulls: bool = ...) -> bool | None: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | None | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... 
- def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, breaks: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, series: bool = ..., left_closed: bool = ..., include_breaks: bool = ...) -> DataFrame | Series: ... - def qcut(self, quantiles: list[float] | int, *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., series: bool = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> DataFrame | Series: ... - def rle(self) -> Series: ... - def rle_id(self) -> Series: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool | None = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool | None = ...) -> Self: ... - def extend(self, other: Series) -> Self: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... 
- def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... 
- def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... 
- @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/series/series.pyi new file mode 100644 index 0000000..384c848 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.12/polars/series/series.pyi @@ -0,0 +1,4390 @@ +#: version 0.18.12 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.deprecation import deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, parse_version as parse_version, range_to_series as 
range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... 
+ def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. 
+ + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self, drop_nulls: bool = ...) -> bool | None: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def all(self, drop_nulls: bool = ...) -> bool | None: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, *args, **kwargs) -> DataFrame | Series: + ''' + Bin continuous values into discrete categories. 
+ + Parameters + ---------- + breaks + A list of unique cut points. + labels + Labels to assign to the bins. If given the length of labels must be + len(breaks) + 1. + break_point_label + Name given to the breakpoint column/field. Only used if series == False or + include_breaks == True + category_label + Name given to the category column. Only used if series == False + series + If True, return a categorical Series in the data\'s original order. + left_closed + Whether intervals should be [) instead of (] + include_breaks + Include the the right endpoint of the bin each observation falls in. + If returning a DataFrame, it will be a column, and if returning a Series + it will be a field in a Struct + + Returns + ------- + DataFrame or Series + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1], series=False) + shape: (12, 3) + ┌──────┬─────────────┬────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1, 1] │ + │ 1.5 ┆ inf ┆ (1, inf] │ + │ 2.0 ┆ inf ┆ (1, inf] │ + │ 2.5 ┆ inf ┆ (1, inf] │ + └──────┴─────────────┴────────────┘ + >>> a.cut([-1, 1], series=True) + shape: (12,) + Series: \'a\' [cat] + [ + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-1, 1]" + "(-1, 1]" + "(-1, 1]" + "(-1, 1]" + "(1, inf]" + "(1, inf]" + "(1, inf]" + ] + >>> a.cut([-1, 1], series=True, left_closed=True) + shape: (12,) + Series: \'a\' [cat] + [ + "[-inf, -1)" + "[-inf, -1)" + "[-inf, -1)" + "[-inf, -1)" + "[-1, 1)" + "[-1, 1)" + "[-1, 1)" + "[-1, 1)" + "[1, inf)" + "[1, inf)" + "[1, inf)" + "[1, inf)" + ] + ''' + def qcut(self, *args, **kwargs) -> DataFrame | Series: + ''' + Discretize continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of evenly spaced probabilities to use. + labels + Labels to assign to the quantiles. If given the length of labels must be + len(breaks) + 1. + break_point_label + Name given to the breakpoint column/field. Only used if series == False or + include_breaks == True + category_label + Name given to the category column. Only used if series == False. + series + If True, return a categorical Series in the data\'s original order + left_closed + Whether intervals should be [) instead of (] + allow_duplicates + If True, the resulting quantile breaks don\'t have to be unique. This can + happen even with unique probs depending on the data. Duplicates will be + dropped, resulting in fewer bins. + include_breaks + Include the the right endpoint of the bin each observation falls in. + If returning a DataFrame, it will be a column, and if returning a Series + it will be a field in a Struct + + Returns + ------- + DataFrame or Series + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut(2, series=True) + shape: (8,) + Series: \'a\' [cat] + [ + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-1.5, inf]" + "(-1.5, inf]" + "(-1.5, inf]" + "(-1.5, inf]" + ] + >>> a.qcut([0.0, 0.25, 0.75], series=False) + shape: (8, 3) + ┌─────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪═══════════════╡ + │ -5 ┆ -5.0 ┆ (-inf, -5] │ + │ -4 ┆ -3.25 ┆ (-5, -3.25] │ + │ -3 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1 ┆ inf ┆ (0.25, inf] │ + │ 2 ┆ inf ┆ (0.25, inf] │ + └─────┴─────────────┴───────────────┘ + >>> a.qcut([0.0, 0.25, 0.75], series=True) + shape: (8,) + Series: \'a\' [cat] + [ + "(-inf, -5]" + "(-5, -3.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(0.25, inf]" + "(0.25, inf]" + ] + >>> a.qcut([0.0, 0.25, 0.75], series=True, left_closed=True) + shape: (8,) + Series: \'a\' [cat] + [ + "[-5, -3.25)" + "[-5, -3.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[0.25, inf)" + "[0.25, inf)" + ] + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and ``append`` will change to always + behave like ``append_chunks=True`` (the previous default). For the + behavior of ``append_chunks=False``, use ``Series.extend``. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. 
+ + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from ``append``, which adds the chunks from ``other`` to the chunks of + this series, ``extend`` appends the data from ``other`` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``append`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer ``append`` over ``extend`` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single ``Series``. In the latter case, finish the sequence + of ``append`` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. 
+ + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. 
+ + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point ``nan`` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set ``zero_copy_only=True``. + + Alternatively, if you want a zero-copy view and know what you are doing, + use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. 
+ Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + Series + The mutated series. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). 
+ + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. 
+ + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. 
+ + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0)  # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42)  # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64-bit integers. For integers with fewer bits, + you can safely use the cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately.
+ + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. 
+ + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. 
+ + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. 
+ + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/dataframe/frame deleted file mode 100644 index 1e5d447..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/dataframe/frame +++ /dev/null @@ -1,286 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase, TextIOWrapper -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Null as Null, Object as Object, Struct as Struct, Time as Time, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name 
as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.selectors import _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnNameOrSelector as ColumnNameOrSelector, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_renamed_methods as deprecate_renamed_methods, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... 
- @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ..., ignore_errors: bool = ...) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... 
- @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def flags(self) -> dict[str, dict[str, bool]]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... 
- def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ..., freeze_panes: str | tuple[int, int] | tuple[str, int, int] | tuple[int, int, int, int] | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, Any] | None = ...) -> None: ... - def write_database(self, table_name: str, connection: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... 
- def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: str | Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... 
- def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, other: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: DataFrame) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... 
- @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def corr(self, **kwargs: Any) -> DataFrame: ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) 
-> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/dataframe/frame.pyi new file mode 100644 index 0000000..6284f1a --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/dataframe/frame.pyi @@ -0,0 +1,6219 @@ +#: version 0.18.13 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Null as Null, Object as Object, Struct as Struct, Time as Time, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_renamed_methods as deprecate_renamed_methods, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence 
as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) 
-> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
+ schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + ``nan_as_null`` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + Polars currently relies on pyarrow's implementation of the dataframe interchange + protocol. Therefore, pyarrow>=11.0.0 is required for this method to work. 
+ + Because Polars can not currently guarantee zero-copy conversion to Arrow for + categorical columns, ``allow_copy=False`` will not work if the dataframe + contains categorical data. + + """ + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. 
+ column + Optional column index or name. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. 
+ order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + ``structured`` is set to ``False`` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) 
-> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... 
"foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. 
+ + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. 
+ autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... 
dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... 
"id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + If you pass ``partition_cols`` here, the dataset will be written + using ``pyarrow.parquet.write_to_dataset``. + The ``partition_cols`` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... 
pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, *args, **kwargs) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Note: Some polars data types like `Null`, `Categorical` and `Time` are + not supported by the delta protocol specification. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Examples + -------- + Instantiate a basic dataframe: + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + + Write DataFrame as a Delta Lake table on local filesystem. + + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on local filesystem. + Note: This will fail if schema of the new data does not match the + schema of existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + Note: If the schema of the new and old data is same, + then setting `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table on cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. 
+ + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... 
+ >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... 
) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. 
+ + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. 
For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use + *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... 
datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... 
], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). 
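+ 
+ A minimal sketch of that ``@lru_cache`` idea (illustrative only; the helper name
+ and data below are assumptions, not part of the polars API):
+ 
+ .. code-block:: python
+ 
+     from functools import lru_cache
+ 
+     @lru_cache(maxsize=None)
+     def expensive(x):
+         return x * 2  # stand-in for a costly per-value computation
+ 
+     df = pl.DataFrame({"foo": [1, 1, 2]})
+     df.apply(lambda t: (expensive(t[0]),))  # repeated inputs hit the cache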
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, *args, **kwargs) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from ``vstack`` which adds the chunks from ``other`` to the chunks of + this ``DataFrame``, ``extend`` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``vstack`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer ``vstack`` over ``extend`` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single ``DataFrame``. In the latter case, finish the sequence of + ``vstack`` operations with a ``rechunk``. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
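+ 
+ A quick way to see the difference from ``vstack`` is to compare chunk counts
+ afterwards (a sketch; it assumes both frames are freshly constructed,
+ single-chunk frames):
+ 
+ .. code-block:: python
+ 
+     df1 = pl.DataFrame({"a": [1, 2]})
+     df2 = pl.DataFrame({"a": [3, 4]})
+     df1.vstack(df2).n_chunks()  # 2 -- the chunks of df2 are only linked
+     df1.extend(df2).n_chunks()  # 1 -- the data is appended to df1's memory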
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. 
+ + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... 
by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... ) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... 
z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... 
) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. 
+ + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using ``iter_rows`` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... 
) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + ''' + def iter_slices(self, n_rows: int = ...) 
-> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. 
+ + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy ``corrcoef``. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... 
).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
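
The hunk above ends the generated DataFrame stub. A minimal, hypothetical sanity check for such a freshly generated .pyi file (the path below is an assumption modelled on the stub layout used by the other files in this diff, not a name taken from it):

import ast
from pathlib import Path

# Illustrative path; adjust to wherever the generated DataFrame stub is written.
stub_path = Path(
    "polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/dataframe/frame.pyi"
)
source = stub_path.read_text()

ast.parse(source)               # raises SyntaxError if the stub is not valid Python
assert "def select(" in source  # spot-check that a documented method survived generation
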
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/expr/expr deleted file mode 100644 index c2be9ec..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/expr/expr +++ /dev/null @@ -1,265 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.exceptions import PolarsInefficientApplyWarning as PolarsInefficientApplyWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... 
- def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self, drop_nulls: bool = ...) -> Self: ... - def all(self, drop_nulls: bool = ...) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def cbrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def keep_name(self) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... 
- def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: list[float], labels: list[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: ... - def qcut(self, quantiles: list[float] | int, labels: list[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> Self: ... - def rle(self) -> Self: ... - def rle_id(self) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... 
- def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | None | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... 
- def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ..., fixed_seed: bool = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ..., fixed_seed: bool = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
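
The deleted file above is the old suffix-less Expr stub; the hunk that follows replaces it with an expr.pyi whose first line carries a "#: version" marker and which keeps the docstrings. A minimal sketch of how a consumer could read that marker back (the helper name is hypothetical; the marker format is the one shown on the first line of the new file):

from pathlib import Path

def read_stub_version(stub_path: Path) -> str:
    # New-style stubs begin with a line like "#: version 0.18.13".
    first_line = stub_path.read_text().splitlines()[0]
    prefix = "#: version "
    if not first_line.startswith(prefix):
        raise ValueError(f"no version marker in {stub_path}")
    return first_line[len(prefix):]

# e.g.:
# read_stub_version(
#     Path("polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/expr/expr.pyi")
# )  # -> "0.18.13"
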
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/expr/expr.pyi new file mode 100644 index 0000000..a414f1f --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/expr/expr.pyi @@ -0,0 +1,7702 @@ +#: version 0.18.13 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientApplyWarning as PolarsInefficientApplyWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... 
+ def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self, drop_nulls: bool = ...) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Parameters + ---------- + drop_nulls + If False, return None if there are nulls but no Trues. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + >>> df = pl.DataFrame(dict(x=[None, False], y=[None, True])) + >>> df.select(pl.col("x").any(True), pl.col("y").any(True)) + shape: (1, 2) + ┌───────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪══════╡ + │ false ┆ true │ + └───────┴──────┘ + >>> df.select(pl.col("x").any(False), pl.col("y").any(False)) + shape: (1, 2) + ┌──────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪══════╡ + │ null ┆ true │ + └──────┴──────┘ + + ''' + def all(self, drop_nulls: bool = ...) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Parameters + ---------- + drop_nulls + If False, return None if there are any nulls. + + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... 
) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + >>> df = pl.DataFrame(dict(x=[None, False], y=[None, True])) + >>> df.select(pl.col("x").all(True), pl.col("y").all(True)) + shape: (1, 2) + ┌───────┬───────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + └───────┴───────┘ + >>> df.select(pl.col("x").all(False), pl.col("y").all(False)) + shape: (1, 2) + ┌──────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪══════╡ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... ) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... 
) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().map_alias(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").keep_name()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. 
+ + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... 
) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... 
pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) 
-> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. 
+ + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. 
+ + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. 
+ + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: list[float], labels: list[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + A list of unique cut points. + labels + Labels to assign to bins. If given, the length must be len(breaks) + 1. 
+ left_closed + Whether intervals should be [) instead of the default of (] + include_breaks + Include the the right endpoint of the bin each observation falls in. + If True, the resulting column will be a Struct. + + Examples + -------- + >>> g = pl.repeat("a", 5, eager=True).append(pl.repeat("b", 5, eager=True)) + >>> df = pl.DataFrame(dict(g=g, x=range(10))) + >>> df.with_columns(q=pl.col("x").cut([2, 5])) + shape: (10, 3) + ┌─────┬─────┬───────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═══════════╡ + │ a ┆ 0 ┆ (-inf, 2] │ + │ a ┆ 1 ┆ (-inf, 2] │ + │ a ┆ 2 ┆ (-inf, 2] │ + │ a ┆ 3 ┆ (2, 5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (5, inf] │ + │ b ┆ 7 ┆ (5, inf] │ + │ b ┆ 8 ┆ (5, inf] │ + │ b ┆ 9 ┆ (5, inf] │ + └─────┴─────┴───────────┘ + >>> df.with_columns(q=pl.col("x").cut([2, 5], left_closed=True)) + shape: (10, 3) + ┌─────┬─────┬───────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═══════════╡ + │ a ┆ 0 ┆ [-inf, 2) │ + │ a ┆ 1 ┆ [-inf, 2) │ + │ a ┆ 2 ┆ [2, 5) │ + │ a ┆ 3 ┆ [2, 5) │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ [5, inf) │ + │ b ┆ 7 ┆ [5, inf) │ + │ b ┆ 8 ┆ [5, inf) │ + │ b ┆ 9 ┆ [5, inf) │ + └─────┴─────┴───────────┘ + ''' + def qcut(self, *args, **kwargs) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of evenly spaced probabilities to use. + labels + Labels to assign to bins. If given, the length must be len(probs) + 1. + If computing over groups this must be set for now. + left_closed + Whether intervals should be [) instead of the default of (] + allow_duplicates + If True, the resulting quantile breaks don\'t have to be unique. This can + happen even with unique probs depending on the data. Duplicates will be + dropped, resulting in fewer bins. + include_breaks + Include the the right endpoint of the bin each observation falls in. + If True, the resulting column will be a Struct. 
+ + Examples + -------- + >>> g = pl.repeat("a", 5, eager=True).append(pl.repeat("b", 5, eager=True)) + >>> df = pl.DataFrame(dict(g=g, x=range(10))) + >>> df.with_columns(q=pl.col("x").qcut([0.5])) + shape: (10, 3) + ┌─────┬─────┬─────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════════════╡ + │ a ┆ 0 ┆ (-inf, 4.5] │ + │ a ┆ 1 ┆ (-inf, 4.5] │ + │ a ┆ 2 ┆ (-inf, 4.5] │ + │ a ┆ 3 ┆ (-inf, 4.5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (4.5, inf] │ + │ b ┆ 7 ┆ (4.5, inf] │ + │ b ┆ 8 ┆ (4.5, inf] │ + │ b ┆ 9 ┆ (4.5, inf] │ + └─────┴─────┴─────────────┘ + >>> df.with_columns(q=pl.col("x").qcut(2)) + shape: (10, 3) + ┌─────┬─────┬─────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════════════╡ + │ a ┆ 0 ┆ (-inf, 4.5] │ + │ a ┆ 1 ┆ (-inf, 4.5] │ + │ a ┆ 2 ┆ (-inf, 4.5] │ + │ a ┆ 3 ┆ (-inf, 4.5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (4.5, inf] │ + │ b ┆ 7 ┆ (4.5, inf] │ + │ b ┆ 8 ┆ (4.5, inf] │ + │ b ┆ 9 ┆ (4.5, inf] │ + └─────┴─────┴─────────────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.5], ["lo", "hi"]).over("g")) + shape: (10, 3) + ┌─────┬─────┬─────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ lo │ + │ a ┆ 1 ┆ lo │ + │ a ┆ 2 ┆ lo │ + │ a ┆ 3 ┆ hi │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ lo │ + │ b ┆ 7 ┆ lo │ + │ b ┆ 8 ┆ hi │ + │ b ┆ 9 ┆ hi │ + └─────┴─────┴─────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.5], ["lo", "hi"], True).over("g")) + shape: (10, 3) + ┌─────┬─────┬─────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ lo │ + │ a ┆ 1 ┆ lo │ + │ a ┆ 2 ┆ hi │ + │ a ┆ 3 ┆ hi │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ lo │ + │ b ┆ 7 ┆ hi │ + │ b ┆ 8 ┆ hi │ + │ b ┆ 9 ┆ hi │ + └─────┴─────┴─────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.25, 0.5], include_breaks=True)) + shape: (10, 3) + ┌─────┬─────┬───────────────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ struct[2] │ + ╞═════╪═════╪═══════════════════════╡ + │ a ┆ 0 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 1 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 2 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 3 ┆ {4.5,"(2.25, 4.5]"} │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 7 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 8 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 9 ┆ {inf,"(4.5, inf]"} │ + └─────┴─────┴───────────────────────┘ + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! 
+ >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. 
+ Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. 
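A minimal round-trip sketch may help here: ``explode`` turns each list element into its own row, and ``implode`` (documented a little further down) collapses a column back into a single list. The frame below is invented for illustration and assumes the Polars version these stubs target.

import polars as pl

# Illustrative data only.
df = pl.DataFrame({"values": [[1, 2], [3], [4, 5]]})

# One row per list element.
exploded = df.select(pl.col("values").explode())
print(exploded)  # 5 rows: 1, 2, 3, 4, 5

# The inverse direction: collect the column back into a single list.
imploded = exploded.select(pl.col("values").implode())
print(imploded)  # one row containing [1, 2, 3, 4, 5]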
+ + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... 
(pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq_missing(pl.col("y")).alias("x == y"), + ... ) + shape: (6, 3) + ┌──────┬──────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞══════╪══════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + │ null ┆ 5.0 ┆ false │ + │ null ┆ null ┆ true │ + └──────┴──────┴────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... 
data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne_missing(pl.col("y")).alias("x != y"), + ... ) + shape: (6, 3) + ┌──────┬──────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞══════╪══════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + │ null ┆ 5.0 ┆ true │ + │ null ┆ null ┆ false │ + └──────┴──────┴────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... 
) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... 
) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... 
) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. 
Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... 
).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, *args, **kwargs) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, *args, **kwargs) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, *args, **kwargs) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... 
window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, *args, **kwargs) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, *args, **kwargs) -> Self: + ''' + Compute a rolling standard deviation. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... 
rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, *args, **kwargs) -> Self: + ''' + Compute a rolling variance. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, *args, **kwargs) -> Self: + ''' + Compute a rolling median. 
+ + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, *args, **kwargs) -> Self: + ''' + Compute a rolling quantile. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... 
quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f32 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. 
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. 
+ + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ..., fixed_seed: bool = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + fixed_seed + If True, The seed will not be incremented between draws. + This can make output predictable because draw ordering can + change due to threads being scheduled in a different order. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. 
+ seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + fixed_seed + If True, The seed will not be incremented between draws. + This can make output predictable because draw ordering can + change due to threads being scheduled in a different order. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Expr + Expression of data type :class:`Struct`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... 
).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self, *args, **kwargs) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/lazyframe/frame deleted file mode 100644 index 9279b81..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/lazyframe/frame +++ /dev/null @@ -1,144 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat, subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyExpr as PyExpr, PyLazyFrame as PyLazyFrame -from polars.selectors import _expand_selectors as _expand_selectors -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, ColumnNameOrSelector as ColumnNameOrSelector, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_methods as deprecate_renamed_methods, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath -from typing import Any, Callable, ClassVar, Collection, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -def _prepare_select(*exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> list[PyExpr]: ... 
- -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... - @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... - def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def serialize(self, file: None = ...) -> str: ... 
- @overload - def serialize(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | Path | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: IntoExpr) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... 
- def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) 
-> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: ... - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..01b2b3a --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/lazyframe/frame.pyi @@ -0,0 +1,3580 @@ +#: version 0.18.13 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_methods as deprecate_renamed_methods, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int +def _prepare_select(*exprs: IntoExpr | Iterable[IntoExpr], 
**named_exprs: IntoExpr) -> list[PyExpr]: ... + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... + @classmethod + def from_json(cls, *args, **kwargs) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to ``StringIO`` + and then use ``LazyFrame.deserialize``. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, *args, **kwargs) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to ``deserialize``. + + Parameters + ---------- + source + Path to a file or a file-like object. + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... 
+ def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"LocalProjection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self, *args, **kwargs) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... 
).explain() # doctest: +SKIP + ''' + def show_graph(self, *args, **kwargs) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self, *args, **kwargs) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. 
+ + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self, *args, **kwargs) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. 
+ compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, *args, **kwargs) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. 
+ simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: IntoExpr) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. 
+ The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). 
Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.groupby_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. 
+ + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. 
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_rolling + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... 
pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... 
).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... 
"population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... 
).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. 
+ + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. 
+ strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. 
+ + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. 
For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. 
+ schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. 
+ + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
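The hunk above adds the full LazyFrame stub; the hunk below removes the old extension-less series stub. Because these generated files are swapped out wholesale, it is worth sanity-checking that a freshly written stub still parses and still exposes the methods documented above. The minimal sketch below uses only the standard library; the stub path and the set of expected method names are assumptions chosen for illustration and are not part of this change set.

import ast
from pathlib import Path

# Assumed location of one generated stub; adjust to the actual _stubs layout
# and polars version present in the repository.
STUB_PATH = Path("src/polugins_type_gen/_stubs/0.18.13/polars/lazyframe/frame.pyi")

# Illustrative subset of methods the LazyFrame stub is expected to define.
EXPECTED_METHODS = {"groupby_dynamic", "join_asof", "join", "with_columns", "fill_null", "melt"}


def lazyframe_methods(stub_path: Path) -> set[str]:
    """Parse the stub and collect the names of functions defined on LazyFrame."""
    tree = ast.parse(stub_path.read_text())
    for node in ast.walk(tree):
        if isinstance(node, ast.ClassDef) and node.name == "LazyFrame":
            # Properties and overloads are plain FunctionDef nodes in a stub body.
            return {item.name for item in node.body if isinstance(item, ast.FunctionDef)}
    raise ValueError(f"no LazyFrame class found in {stub_path}")


if __name__ == "__main__":
    found = lazyframe_methods(STUB_PATH)
    missing = EXPECTED_METHODS - found
    assert not missing, f"stub is missing expected methods: {sorted(missing)}"
    print(f"{STUB_PATH} parses cleanly and defines {len(found)} methods")

Run from the package root, this prints the method count on success and raises an AssertionError listing any missing names if the generated stub is truncated or malformed.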
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/series/series deleted file mode 100644 index f6e2e59..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/series/series +++ /dev/null @@ -1,346 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.deprecation import deprecate_renamed_parameter as 
deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... 
- @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - @overload - def __floordiv__(self, other: Expr) -> Expr: ... - @overload - def __floordiv__(self, other: Any) -> Series: ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def __column_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def cbrt(self) -> Series: ... - def any(self, drop_nulls: bool = ...) -> bool | None: ... - def all(self, drop_nulls: bool = ...) -> bool | None: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | None | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... 
- def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, breaks: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, series: bool = ..., left_closed: bool = ..., include_breaks: bool = ...) -> DataFrame | Series: ... - def qcut(self, quantiles: list[float] | int, *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., series: bool = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> DataFrame | Series: ... - def rle(self) -> Series: ... - def rle_id(self) -> Series: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool | None = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool | None = ...) -> Self: ... - def extend(self, other: Series) -> Self: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... 
- def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... 
- def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... 
- @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/series/series.pyi new file mode 100644 index 0000000..454356a --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.13/polars/series/series.pyi @@ -0,0 +1,4390 @@ +#: version 0.18.13 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.deprecation import deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, parse_version as parse_version, range_to_series as 
range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... 
+ def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. 
+ + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self, drop_nulls: bool = ...) -> bool | None: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def all(self, drop_nulls: bool = ...) -> bool | None: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, *args, **kwargs) -> DataFrame | Series: + ''' + Bin continuous values into discrete categories. 
+ + Parameters + ---------- + breaks + A list of unique cut points. + labels + Labels to assign to the bins. If given the length of labels must be + len(breaks) + 1. + break_point_label + Name given to the breakpoint column/field. Only used if series == False or + include_breaks == True + category_label + Name given to the category column. Only used if series == False + series + If True, return a categorical Series in the data\'s original order. + left_closed + Whether intervals should be [) instead of (] + include_breaks + Include the the right endpoint of the bin each observation falls in. + If returning a DataFrame, it will be a column, and if returning a Series + it will be a field in a Struct + + Returns + ------- + DataFrame or Series + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1], series=False) + shape: (12, 3) + ┌──────┬─────────────┬────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1, 1] │ + │ 1.5 ┆ inf ┆ (1, inf] │ + │ 2.0 ┆ inf ┆ (1, inf] │ + │ 2.5 ┆ inf ┆ (1, inf] │ + └──────┴─────────────┴────────────┘ + >>> a.cut([-1, 1], series=True) + shape: (12,) + Series: \'a\' [cat] + [ + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-1, 1]" + "(-1, 1]" + "(-1, 1]" + "(-1, 1]" + "(1, inf]" + "(1, inf]" + "(1, inf]" + ] + >>> a.cut([-1, 1], series=True, left_closed=True) + shape: (12,) + Series: \'a\' [cat] + [ + "[-inf, -1)" + "[-inf, -1)" + "[-inf, -1)" + "[-inf, -1)" + "[-1, 1)" + "[-1, 1)" + "[-1, 1)" + "[-1, 1)" + "[1, inf)" + "[1, inf)" + "[1, inf)" + "[1, inf)" + ] + ''' + def qcut(self, *args, **kwargs) -> DataFrame | Series: + ''' + Discretize continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of evenly spaced probabilities to use. + labels + Labels to assign to the quantiles. If given the length of labels must be + len(breaks) + 1. + break_point_label + Name given to the breakpoint column/field. Only used if series == False or + include_breaks == True + category_label + Name given to the category column. Only used if series == False. + series + If True, return a categorical Series in the data\'s original order + left_closed + Whether intervals should be [) instead of (] + allow_duplicates + If True, the resulting quantile breaks don\'t have to be unique. This can + happen even with unique probs depending on the data. Duplicates will be + dropped, resulting in fewer bins. + include_breaks + Include the the right endpoint of the bin each observation falls in. + If returning a DataFrame, it will be a column, and if returning a Series + it will be a field in a Struct + + Returns + ------- + DataFrame or Series + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut(2, series=True) + shape: (8,) + Series: \'a\' [cat] + [ + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-1.5, inf]" + "(-1.5, inf]" + "(-1.5, inf]" + "(-1.5, inf]" + ] + >>> a.qcut([0.0, 0.25, 0.75], series=False) + shape: (8, 3) + ┌─────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪═══════════════╡ + │ -5 ┆ -5.0 ┆ (-inf, -5] │ + │ -4 ┆ -3.25 ┆ (-5, -3.25] │ + │ -3 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1 ┆ inf ┆ (0.25, inf] │ + │ 2 ┆ inf ┆ (0.25, inf] │ + └─────┴─────────────┴───────────────┘ + >>> a.qcut([0.0, 0.25, 0.75], series=True) + shape: (8,) + Series: \'a\' [cat] + [ + "(-inf, -5]" + "(-5, -3.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(0.25, inf]" + "(0.25, inf]" + ] + >>> a.qcut([0.0, 0.25, 0.75], series=True, left_closed=True) + shape: (8,) + Series: \'a\' [cat] + [ + "[-5, -3.25)" + "[-5, -3.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[0.25, inf)" + "[0.25, inf)" + ] + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and ``append`` will change to always + behave like ``append_chunks=True`` (the previous default). For the + behavior of ``append_chunks=False``, use ``Series.extend``. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. 
+ + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from ``append``, which adds the chunks from ``other`` to the chunks of + this series, ``extend`` appends the data from ``other`` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``append`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer ``append`` over ``extend`` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single ``Series``. In the latter case, finish the sequence + of ``append`` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. 
+ + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. 
+ + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point ``nan`` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set ``zero_copy_only=True``. + + Alternatively, if you want a zero-copy view and know what you are doing, + use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. 
+ Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + Series + The mutated series. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). 
+ + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. 
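Editor's note, a minimal sketch that is not part of the generated stub: it contrasts the UDF route described in the `apply` docstring above with the native operation that docstring recommends; the outputs in the comments are assumptions.

    import polars as pl

    s = pl.Series("a", [1, 2, 3])
    # UDF route: the Python lambda runs once per element
    via_apply = s.apply(lambda x: x + 10)
    # native route: a single vectorised operation, preferred per the docstring above
    via_native = s + 10
    print(via_apply.to_list())   # expected: [11, 12, 13]
    print(via_native.to_list())  # expected: [11, 12, 13]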
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. 
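Editor's note, a small sketch that is not part of the generated stub: it exercises the `weights` parameter shared by the rolling aggregations documented above; the expected values are the editor's arithmetic, assuming each window is multiplied elementwise by the weights and then summed, as the docstrings describe.

    import polars as pl

    s = pl.Series("a", [1.0, 2.0, 3.0, 4.0])
    # each window [x0, x1] contributes 0.25*x0 + 0.75*x1
    print(s.rolling_sum(window_size=2, weights=[0.25, 0.75]).to_list())
    # expected: [None, 1.75, 2.75, 3.75]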
+ + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. 
+ + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. 
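Editor's note, a one-line sketch that is not part of the generated stub: `abs` is documented above without an example, so a usage sketch is added here; the output comment is an assumption.

    import polars as pl

    s = pl.Series("a", [-1, 0, 2])
    print(s.abs().to_list())  # expected: [1, 0, 2]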
+ + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. 
+ + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. 
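Editor's note, a short sketch that is not part of the generated stub: `clip_min` and `clip_max` are documented above without examples, so a usage sketch is added here; the outputs in the comments are assumptions consistent with the `clip` example above.

    import polars as pl

    s = pl.Series("foo", [-50, 5, 50])
    print(s.clip_min(0).to_list())   # expected: [0, 5, 50]
    print(s.clip_max(10).to_list())  # expected: [-50, 5, 10]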
+ + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. 
+ + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/dataframe/frame deleted file mode 100644 index 1c53d6c..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/dataframe/frame +++ /dev/null @@ -1,286 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase, TextIOWrapper -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions import col as col, lit as lit -from polars.interchange.dataframe import PolarsDataFrame as PolarsDataFrame -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars 
import PyDataFrame as PyDataFrame -from polars.selectors import _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnNameOrSelector as ColumnNameOrSelector, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_renamed_methods as deprecate_renamed_methods, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... 
- @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ..., raise_if_empty: bool = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ..., ignore_errors: bool = ...) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... 
- @property - def flags(self) -> dict[str, dict[str, bool]]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... 
- def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., header_format: dict[str, Any] | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ..., freeze_panes: str | tuple[int, int] | tuple[str, int, int] | tuple[int, int, int, int] | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, Any] | None = ...) -> None: ... - def write_database(self, table_name: str, connection: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... 
- def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: str | Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... 
- def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, other: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: DataFrame) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... 
- @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def corr(self, **kwargs: Any) -> DataFrame: ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) 
-> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/dataframe/frame.pyi new file mode 100644 index 0000000..4e11d23 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/dataframe/frame.pyi @@ -0,0 +1,6285 @@ +#: version 0.18.15 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_renamed_methods as deprecate_renamed_methods, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, 
normalise_filepath as normalise_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) 
-> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
+ schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to ``True`` will raise a ``NotImplementedError``. + allow_copy + Allow memory to be copied to perform the conversion. 
If set to ``False``, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars dataframe to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (<DtypeKind.FLOAT: 2>, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...)
-> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + ``structured`` is set to ``False`` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) 
-> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. 
If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns.
Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + header_format : dict + A ``{key:value,}`` dictionary of ``xlsxwriter`` format options to apply + to the table header row, such as ``{"bold":True, "font_color":"#702963"}``. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). 
+ * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + + Notes + ----- + * A list of compatible ``xlsxwriter`` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. 
Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... 
"bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. 
+ compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + If you pass ``partition_cols`` here, the dataset will be written + using ``pyarrow.parquet.write_to_dataset``. + The ``partition_cols`` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, *args, **kwargs) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. 
+ * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. + + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. 
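Unlike the other writers above, the ``write_database`` docstring lists its parameters (``table_name``, ``connection``, ``if_exists``, ``engine``) but gives no usage example. A minimal sketch built only from those documented parameters; the table name, the SQLite file, and the sample frame are purely illustrative:

    import polars as pl

    df = pl.DataFrame({"foo": [1, 2, 3], "bar": ["a", "b", "c"]})

    # Hypothetical local SQLite target via the documented sqlalchemy engine;
    # "replace" recreates the table if it already exists.
    df.write_database(
        table_name="my_table",
        connection="sqlite:///example.db",
        if_exists="replace",
        engine="sqlalchemy",
    )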
+ + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. 
+ + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. 
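The ``describe`` docstring earlier documents a ``percentiles`` argument, but its example relies on the default 25/50/75 split. A hedged sketch of overriding the reported percentiles (the values are illustrative; they must lie in ``[0, 1]`` as stated above):

    import polars as pl

    df = pl.DataFrame({"a": [1.0, 2.8, 3.0], "b": [4, 5, None]})

    # Report the 10th and 90th percentiles instead of the defaults.
    summary = df.describe(percentiles=[0.1, 0.9])
    print(summary)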
+ + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... 
{ + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. 
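``slice`` above notes that negative indexing is supported, but its example only uses a positive offset. A minimal sketch of the negative case, assuming the semantics given in the parameter descriptions (the offset counts from the end, and ``length=None`` runs to the last row):

    import polars as pl

    df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0]})

    # Keep the final two rows: start two rows from the end and take everything after.
    print(df.slice(-2))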
+ + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). 
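The ``limit`` alias above carries no example of its own; a one-line sketch of the equivalence its docstring states (the frame is illustrative):

    import polars as pl

    df = pl.DataFrame({"foo": [1, 2, 3, 4, 5], "bar": [6, 7, 8, 9, 10]})

    # limit is documented as an alias for head, so both return the first n rows.
    assert df.limit(3).frame_equal(df.head(3))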
+ + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. 
+ + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. 
Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". 
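The ``groupby_rolling`` example above only sets ``period``; ``offset`` and ``closed`` are described in its parameters but never shown. A hedged sketch combining them, with the window bounds derived from those descriptions (data and column names are illustrative; output omitted):

    import polars as pl

    df = pl.DataFrame(
        {"dt": ["2020-01-01", "2020-01-02", "2020-01-05"], "a": [3, 7, 5]}
    ).with_columns(pl.col("dt").str.strptime(pl.Date).set_sorted())

    # Per the parameter descriptions, period="2d" with offset="-3d" and
    # closed="left" should give each row the window [t - 3d, t - 1d).
    out = df.groupby_rolling(
        index_column="dt", period="2d", offset="-3d", closed="left"
    ).agg(pl.sum("a").alias("sum_a"))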
+ + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... 
) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. 
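None of the ``groupby_dynamic`` examples above exercise the ``start_by`` strategies listed in its parameters. A hedged sketch anchoring weekly windows to the Monday before the first data point, as described for the 'monday' option (the data is illustrative; output omitted):

    import polars as pl
    from datetime import datetime

    df = pl.DataFrame(
        {
            "time": pl.date_range(
                start=datetime(2021, 12, 16),
                end=datetime(2021, 12, 30),
                interval="1d",
                eager=True,
            ),
            "n": range(15),
        }
    )

    # Weekly windows ("1w") that start on the Monday before the first data point.
    out = df.groupby_dynamic("time", every="1w", start_by="monday").agg(pl.col("n").sum())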
+ + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. 
+ by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. 
+ + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. 
+ This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, *args, **kwargs) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from ``vstack`` which adds the chunks from ``other`` to the chunks of + this ``DataFrame``, ``extend`` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. 
+ + Prefer ``extend`` over ``vstack`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer ``vstack`` over ``extend`` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single ``DataFrame``. In the latter case, finish the sequence of + ``vstack`` operations with a ``rechunk``. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. + + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... 
) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. 
+ aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... ) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... 
) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. 
+ This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. 
+ **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. 
+ Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. 
+ + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. 
+ + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using ``iter_rows`` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. 
+ + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. 
+ + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... 
) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy ``corrcoef``. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. 
+ It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
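# --- Editor's aside (illustrative only; not part of the generated stub or of this diff) ---
# The DataFrame stub above documents the row-export methods `rows`, `rows_by_key`
# and `iter_rows`. A minimal usage sketch, assuming the polars version this stub
# targets is installed; the frame and column names below are made up for illustration.
import polars as pl

df = pl.DataFrame({"w": ["a", "b", "b"], "y": [1.0, 2.5, 3.0]})

# Materialise every row as a tuple (the docstring warns this can be expensive).
all_rows = df.rows()

# Key rows by column "w"; by default the key values are omitted from the grouped rows.
by_key = df.rows_by_key(key=["w"])

# Iterate lazily instead of materialising all rows at once.
for row in df.iter_rows(named=True):
    print(row["w"], row["y"])
# --- end aside ---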
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/expr/expr deleted file mode 100644 index c2be9ec..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/expr/expr +++ /dev/null @@ -1,265 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.exceptions import PolarsInefficientApplyWarning as PolarsInefficientApplyWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... 
- def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self, drop_nulls: bool = ...) -> Self: ... - def all(self, drop_nulls: bool = ...) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def cbrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def keep_name(self) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... 
- def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: list[float], labels: list[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: ... - def qcut(self, quantiles: list[float] | int, labels: list[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> Self: ... - def rle(self) -> Self: ... - def rle_id(self) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... 
- def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | None | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... 
- def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ..., fixed_seed: bool = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ..., fixed_seed: bool = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
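# --- Editor's aside (illustrative only; not part of the generated stubs or of this diff) ---
# The Expr stubs in this diff type `pipe` as `Callable[Concatenate[Expr, P], T]` with
# `*args: P.args, **kwargs: P.kwargs`, which is what should let a type checker validate
# the extra arguments forwarded to a user-defined function. A minimal sketch, assuming
# polars is installed; `scale` is a hypothetical helper, not a polars API.
import polars as pl

def scale(expr: pl.Expr, factor: int) -> pl.Expr:
    # User-defined transformation, forwarded through Expr.pipe.
    return expr * factor

df = pl.DataFrame({"a": [1, 2, 3]})

# Under the ParamSpec-typed stub, `factor=10` is checked against `scale`'s signature;
# passing e.g. factor="x" should be flagged by mypy.
out = df.select(pl.col("a").pipe(scale, factor=10))
# --- end aside ---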
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/expr/expr.pyi new file mode 100644 index 0000000..51f1a9d --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/expr/expr.pyi @@ -0,0 +1,7702 @@ +#: version 0.18.15 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientApplyWarning as PolarsInefficientApplyWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... 
+ def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self, drop_nulls: bool = ...) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Parameters + ---------- + drop_nulls + If False, return None if there are nulls but no Trues. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + >>> df = pl.DataFrame(dict(x=[None, False], y=[None, True])) + >>> df.select(pl.col("x").any(True), pl.col("y").any(True)) + shape: (1, 2) + ┌───────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪══════╡ + │ false ┆ true │ + └───────┴──────┘ + >>> df.select(pl.col("x").any(False), pl.col("y").any(False)) + shape: (1, 2) + ┌──────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪══════╡ + │ null ┆ true │ + └──────┴──────┘ + + ''' + def all(self, drop_nulls: bool = ...) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Parameters + ---------- + drop_nulls + If False, return None if there are any nulls. + + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... 
) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + >>> df = pl.DataFrame(dict(x=[None, False], y=[None, True])) + >>> df.select(pl.col("x").all(True), pl.col("y").all(True)) + shape: (1, 2) + ┌───────┬───────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + └───────┴───────┘ + >>> df.select(pl.col("x").all(False), pl.col("y").all(False)) + shape: (1, 2) + ┌──────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪══════╡ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... ) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... 
) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().map_alias(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").keep_name()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. 
+ + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... 
) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... 
pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) 
-> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. 
+ + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. 
+ + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. 
+ + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: list[float], labels: list[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + A list of unique cut points. + labels + Labels to assign to bins. If given, the length must be len(breaks) + 1. 
+ left_closed + Whether intervals should be [) instead of the default of (] + include_breaks + Include the the right endpoint of the bin each observation falls in. + If True, the resulting column will be a Struct. + + Examples + -------- + >>> g = pl.repeat("a", 5, eager=True).append(pl.repeat("b", 5, eager=True)) + >>> df = pl.DataFrame(dict(g=g, x=range(10))) + >>> df.with_columns(q=pl.col("x").cut([2, 5])) + shape: (10, 3) + ┌─────┬─────┬───────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═══════════╡ + │ a ┆ 0 ┆ (-inf, 2] │ + │ a ┆ 1 ┆ (-inf, 2] │ + │ a ┆ 2 ┆ (-inf, 2] │ + │ a ┆ 3 ┆ (2, 5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (5, inf] │ + │ b ┆ 7 ┆ (5, inf] │ + │ b ┆ 8 ┆ (5, inf] │ + │ b ┆ 9 ┆ (5, inf] │ + └─────┴─────┴───────────┘ + >>> df.with_columns(q=pl.col("x").cut([2, 5], left_closed=True)) + shape: (10, 3) + ┌─────┬─────┬───────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═══════════╡ + │ a ┆ 0 ┆ [-inf, 2) │ + │ a ┆ 1 ┆ [-inf, 2) │ + │ a ┆ 2 ┆ [2, 5) │ + │ a ┆ 3 ┆ [2, 5) │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ [5, inf) │ + │ b ┆ 7 ┆ [5, inf) │ + │ b ┆ 8 ┆ [5, inf) │ + │ b ┆ 9 ┆ [5, inf) │ + └─────┴─────┴───────────┘ + ''' + def qcut(self, *args, **kwargs) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of evenly spaced probabilities to use. + labels + Labels to assign to bins. If given, the length must be len(probs) + 1. + If computing over groups this must be set for now. + left_closed + Whether intervals should be [) instead of the default of (] + allow_duplicates + If True, the resulting quantile breaks don\'t have to be unique. This can + happen even with unique probs depending on the data. Duplicates will be + dropped, resulting in fewer bins. + include_breaks + Include the the right endpoint of the bin each observation falls in. + If True, the resulting column will be a Struct. 
+ + Examples + -------- + >>> g = pl.repeat("a", 5, eager=True).append(pl.repeat("b", 5, eager=True)) + >>> df = pl.DataFrame(dict(g=g, x=range(10))) + >>> df.with_columns(q=pl.col("x").qcut([0.5])) + shape: (10, 3) + ┌─────┬─────┬─────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════════════╡ + │ a ┆ 0 ┆ (-inf, 4.5] │ + │ a ┆ 1 ┆ (-inf, 4.5] │ + │ a ┆ 2 ┆ (-inf, 4.5] │ + │ a ┆ 3 ┆ (-inf, 4.5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (4.5, inf] │ + │ b ┆ 7 ┆ (4.5, inf] │ + │ b ┆ 8 ┆ (4.5, inf] │ + │ b ┆ 9 ┆ (4.5, inf] │ + └─────┴─────┴─────────────┘ + >>> df.with_columns(q=pl.col("x").qcut(2)) + shape: (10, 3) + ┌─────┬─────┬─────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════════════╡ + │ a ┆ 0 ┆ (-inf, 4.5] │ + │ a ┆ 1 ┆ (-inf, 4.5] │ + │ a ┆ 2 ┆ (-inf, 4.5] │ + │ a ┆ 3 ┆ (-inf, 4.5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (4.5, inf] │ + │ b ┆ 7 ┆ (4.5, inf] │ + │ b ┆ 8 ┆ (4.5, inf] │ + │ b ┆ 9 ┆ (4.5, inf] │ + └─────┴─────┴─────────────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.5], ["lo", "hi"]).over("g")) + shape: (10, 3) + ┌─────┬─────┬─────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ lo │ + │ a ┆ 1 ┆ lo │ + │ a ┆ 2 ┆ lo │ + │ a ┆ 3 ┆ hi │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ lo │ + │ b ┆ 7 ┆ lo │ + │ b ┆ 8 ┆ hi │ + │ b ┆ 9 ┆ hi │ + └─────┴─────┴─────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.5], ["lo", "hi"], True).over("g")) + shape: (10, 3) + ┌─────┬─────┬─────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ lo │ + │ a ┆ 1 ┆ lo │ + │ a ┆ 2 ┆ hi │ + │ a ┆ 3 ┆ hi │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ lo │ + │ b ┆ 7 ┆ hi │ + │ b ┆ 8 ┆ hi │ + │ b ┆ 9 ┆ hi │ + └─────┴─────┴─────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.25, 0.5], include_breaks=True)) + shape: (10, 3) + ┌─────┬─────┬───────────────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ struct[2] │ + ╞═════╪═════╪═══════════════════════╡ + │ a ┆ 0 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 1 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 2 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 3 ┆ {4.5,"(2.25, 4.5]"} │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 7 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 8 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 9 ┆ {inf,"(4.5, inf]"} │ + └─────┴─────┴───────────────────────┘ + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! 
+ >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. 
+ Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. 
+ + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... 
(pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq_missing(pl.col("y")).alias("x == y"), + ... ) + shape: (6, 3) + ┌──────┬──────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞══════╪══════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + │ null ┆ 5.0 ┆ false │ + │ null ┆ null ┆ true │ + └──────┴──────┴────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... 
data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne_missing(pl.col("y")).alias("x != y"), + ... ) + shape: (6, 3) + ┌──────┬──────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞══════╪══════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + │ null ┆ 5.0 ┆ true │ + │ null ┆ null ┆ false │ + └──────┴──────┴────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... 
) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... 
) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... 
) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. 
Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... 
).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, *args, **kwargs) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, *args, **kwargs) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, *args, **kwargs) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... 
window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, *args, **kwargs) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, *args, **kwargs) -> Self: + ''' + Compute a rolling standard deviation. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... 
rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, *args, **kwargs) -> Self: + ''' + Compute a rolling variance. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, *args, **kwargs) -> Self: + ''' + Compute a rolling median. 
+ + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, *args, **kwargs) -> Self: + ''' + Compute a rolling quantile. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... 
quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f32 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. 
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. 
+ + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ..., fixed_seed: bool = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + fixed_seed + If True, The seed will not be incremented between draws. + This can make output predictable because draw ordering can + change due to threads being scheduled in a different order. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. 
+ seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + fixed_seed + If True, The seed will not be incremented between draws. + This can make output predictable because draw ordering can + change due to threads being scheduled in a different order. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Expr + Expression of data type :class:`Struct`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... 
).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self, *args, **kwargs) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/lazyframe/frame deleted file mode 100644 index f86511d..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/lazyframe/frame +++ /dev/null @@ -1,144 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat, subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyExpr as PyExpr, PyLazyFrame as PyLazyFrame -from polars.selectors import _expand_selectors as _expand_selectors -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, ColumnNameOrSelector as ColumnNameOrSelector, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_methods as deprecate_renamed_methods, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath -from typing import Any, Callable, ClassVar, Collection, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -def _prepare_select(*exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> list[PyExpr]: ... 
- -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... - @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ..., raise_if_empty: bool = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... - def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def serialize(self, file: None = ...) -> str: ... 
- @overload - def serialize(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | Path | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: IntoExpr) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... 
- def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) 
-> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: ... - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..5d41cfb --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/lazyframe/frame.pyi @@ -0,0 +1,3584 @@ +#: version 0.18.15 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_methods as deprecate_renamed_methods, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int +def _prepare_select(*exprs: IntoExpr | Iterable[IntoExpr], 
**named_exprs: IntoExpr) -> list[PyExpr]: ... + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... + @classmethod + def from_json(cls, *args, **kwargs) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to ``StringIO`` + and then use ``LazyFrame.deserialize``. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, *args, **kwargs) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to ``deserialize``. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... 
+ def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"LocalProjection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self, *args, **kwargs) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. 
+ streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self, *args, **kwargs) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. 
+ + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self, *args, **kwargs) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self, *args, **kwargs) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... 
) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, *args, **kwargs) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. 
This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: IntoExpr) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. 
+ **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.groupby_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. 
+ + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. 
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_rolling + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... 
pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... 
).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... 
"population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... 
).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. 
+ + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. 
+ strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. 
+ + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. 
For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. 
+ schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. 
+ + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def __getattr__(self, obj: T, item: Any) -> Any: ... + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/series/series deleted file mode 100644 index 8a2dbae..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/series/series +++ /dev/null @@ -1,346 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.deprecation import deprecate_renamed_parameter as 
deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> tuple[int, int, int]: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... 
- @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - @overload - def __floordiv__(self, other: Expr) -> Expr: ... - @overload - def __floordiv__(self, other: Any) -> Series: ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def __column_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def cbrt(self) -> Series: ... - def any(self, drop_nulls: bool = ...) -> bool | None: ... - def all(self, drop_nulls: bool = ...) -> bool | None: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | None | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... 
- def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, breaks: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, series: bool = ..., left_closed: bool = ..., include_breaks: bool = ...) -> DataFrame | Series: ... - def qcut(self, quantiles: list[float] | int, *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., series: bool = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> DataFrame | Series: ... - def rle(self) -> Series: ... - def rle_id(self) -> Series: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool | None = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool | None = ...) -> Self: ... - def extend(self, other: Series) -> Self: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... 
- def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... 
- def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... 
- @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/series/series.pyi new file mode 100644 index 0000000..9005e09 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.15/polars/series/series.pyi @@ -0,0 +1,4392 @@ +#: version 0.18.15 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.deprecation import deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as 
parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Self: ... 
+ def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. 
+ + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self, drop_nulls: bool = ...) -> bool | None: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def all(self, drop_nulls: bool = ...) -> bool | None: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, *args, **kwargs) -> DataFrame | Series: + ''' + Bin continuous values into discrete categories. 
+ + Parameters + ---------- + breaks + A list of unique cut points. + labels + Labels to assign to the bins. If given the length of labels must be + len(breaks) + 1. + break_point_label + Name given to the breakpoint column/field. Only used if series == False or + include_breaks == True + category_label + Name given to the category column. Only used if series == False + series + If True, return a categorical Series in the data\'s original order. + left_closed + Whether intervals should be [) instead of (] + include_breaks + Include the the right endpoint of the bin each observation falls in. + If returning a DataFrame, it will be a column, and if returning a Series + it will be a field in a Struct + + Returns + ------- + DataFrame or Series + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1], series=False) + shape: (12, 3) + ┌──────┬─────────────┬────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1, 1] │ + │ 1.5 ┆ inf ┆ (1, inf] │ + │ 2.0 ┆ inf ┆ (1, inf] │ + │ 2.5 ┆ inf ┆ (1, inf] │ + └──────┴─────────────┴────────────┘ + >>> a.cut([-1, 1], series=True) + shape: (12,) + Series: \'a\' [cat] + [ + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-1, 1]" + "(-1, 1]" + "(-1, 1]" + "(-1, 1]" + "(1, inf]" + "(1, inf]" + "(1, inf]" + ] + >>> a.cut([-1, 1], series=True, left_closed=True) + shape: (12,) + Series: \'a\' [cat] + [ + "[-inf, -1)" + "[-inf, -1)" + "[-inf, -1)" + "[-inf, -1)" + "[-1, 1)" + "[-1, 1)" + "[-1, 1)" + "[-1, 1)" + "[1, inf)" + "[1, inf)" + "[1, inf)" + "[1, inf)" + ] + ''' + def qcut(self, *args, **kwargs) -> DataFrame | Series: + ''' + Discretize continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of evenly spaced probabilities to use. + labels + Labels to assign to the quantiles. If given the length of labels must be + len(breaks) + 1. + break_point_label + Name given to the breakpoint column/field. Only used if series == False or + include_breaks == True + category_label + Name given to the category column. Only used if series == False. + series + If True, return a categorical Series in the data\'s original order + left_closed + Whether intervals should be [) instead of (] + allow_duplicates + If True, the resulting quantile breaks don\'t have to be unique. This can + happen even with unique probs depending on the data. Duplicates will be + dropped, resulting in fewer bins. + include_breaks + Include the the right endpoint of the bin each observation falls in. + If returning a DataFrame, it will be a column, and if returning a Series + it will be a field in a Struct + + Returns + ------- + DataFrame or Series + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut(2, series=True) + shape: (8,) + Series: \'a\' [cat] + [ + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-1.5, inf]" + "(-1.5, inf]" + "(-1.5, inf]" + "(-1.5, inf]" + ] + >>> a.qcut([0.0, 0.25, 0.75], series=False) + shape: (8, 3) + ┌─────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪═══════════════╡ + │ -5 ┆ -5.0 ┆ (-inf, -5] │ + │ -4 ┆ -3.25 ┆ (-5, -3.25] │ + │ -3 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1 ┆ inf ┆ (0.25, inf] │ + │ 2 ┆ inf ┆ (0.25, inf] │ + └─────┴─────────────┴───────────────┘ + >>> a.qcut([0.0, 0.25, 0.75], series=True) + shape: (8,) + Series: \'a\' [cat] + [ + "(-inf, -5]" + "(-5, -3.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(0.25, inf]" + "(0.25, inf]" + ] + >>> a.qcut([0.0, 0.25, 0.75], series=True, left_closed=True) + shape: (8,) + Series: \'a\' [cat] + [ + "[-5, -3.25)" + "[-5, -3.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[0.25, inf)" + "[0.25, inf)" + ] + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and ``append`` will change to always + behave like ``append_chunks=True`` (the previous default). For the + behavior of ``append_chunks=False``, use ``Series.extend``. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. 
+ + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from ``append``, which adds the chunks from ``other`` to the chunks of + this series, ``extend`` appends the data from ``other`` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``append`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer ``append`` over ``extend`` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single ``Series``. In the latter case, finish the sequence + of ``append`` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. 
+ + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. 
+ + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point ``nan`` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set ``zero_copy_only=True``. + + Alternatively, if you want a zero-copy view and know what you are doing, + use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. 
+ Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + Series + The mutated series. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). 
+ + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. 
+ + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. 
+ + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. 
+ + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. 
+ + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. 
+ + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. 
+ + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/dataframe/frame deleted file mode 100644 index e2fbf8e..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/dataframe/frame +++ /dev/null @@ -1,282 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Null as Null, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, Struct as Struct, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as 
_xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: set[str] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) 
-> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, names: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... 
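
(Editorial aside, not part of the generated diff.) In the old stub above, `columns` is exposed as a `@property` returning `list[str]` with a companion setter, whereas the regenerated stub further down declares it as a plain `columns: list[str]` attribute. Either way, reading the names and renaming by assignment both type-check; a minimal sketch, assuming polars 0.18.x:

import polars as pl

df = pl.DataFrame({"a": [1], "b": [2]})

# Read access: annotated as list[str] in both stub variants.
names = df.columns

# Rename by assignment: accepted by the property setter (old stub) and by the
# plain attribute annotation (regenerated stub) alike.
df.columns = ["x", "y"]
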
- def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... 
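
(Editorial aside.) The paired `@overload` declarations for `to_dict` above key the return type off a `Literal` value of `as_series`, so a type checker can pick the correct variant from the call site alone. A minimal sketch, assuming polars 0.18.x and a checker such as mypy:

import polars as pl

df = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})

# as_series=True (the default overload): the checker infers dict[str, pl.Series].
as_series = df.to_dict(as_series=True)

# as_series=False: the second overload applies, giving dict[str, list] instead.
as_lists = df.to_dict(as_series=False)
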
- @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... 
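
(Editorial aside.) `write_json` and `write_csv` above use the same overload idiom for their `file` parameter: passing `None` (or omitting it) returns the serialized `str`, while passing a path or buffer writes the data out and returns `None`. A small illustration, assuming polars 0.18.x:

import polars as pl

df = pl.DataFrame({"a": [1, 2]})

# No target given: the `file: None` overload applies and a str comes back.
csv_text = df.write_csv()

# Target given: the other overload applies; the file is written, None is returned.
df.write_csv("out.csv")  # hypothetical output path, for illustration only
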
- def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: str = ...) -> DataFrame: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... 
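
(Editorial aside.) The `pipe` signature above uses the module-level `ParamSpec` `P` (declared further up in this stub) via `Callable[Concatenate[DataFrame, P], T]`, which lets a checker match the extra `pipe` arguments against the callback's own parameters. A sketch under that assumption, with a hypothetical helper:

import polars as pl

def add_total(df: pl.DataFrame, column: str) -> pl.DataFrame:
    # Hypothetical helper used only to illustrate the typed pass-through.
    return df.with_columns(pl.col(column).sum().alias("total"))

df = pl.DataFrame({"a": [1, 2, 3]})

# `df` binds the Concatenate'd first parameter; "a" is checked against `column`.
out = df.pipe(add_total, "a")
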
- def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ...) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... 
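
(Editorial aside.) `max`, `min`, `sum`, and `mean` above are each overloaded on `axis`: `axis=0` keeps the frame shape and returns `Self`, while `axis=1` collapses across columns and returns a `Series`. Illustrative only, and assumes polars 0.18.x, where the `axis` keyword was still accepted:

import polars as pl

df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})

# Default axis=0: one-row DataFrame of per-column sums (the `Self` overload).
col_sums = df.sum()

# axis=1: per-row sums, so the `Series` overload applies.
row_sums = df.sum(axis=1)
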
- def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs): ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/dataframe/frame.pyi new file mode 100644 index 0000000..ee647ba --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/dataframe/frame.pyi @@ -0,0 +1,5759 @@ +#: version 0.18.2 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Null as Null, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, Struct as Struct, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as 
_unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ...) 
-> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. 
+ + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. 
+ + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... 
+ def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. 
+ + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. 
+ null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. 
+ * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. 
+ float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... 
worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... 
"formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection uri, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Note: Some polars data types like `Null`, `Categorical` and `Time` are + not supported by the delta protocol specification. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. 
+ * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Examples + -------- + Instantiate a basic dataframe: + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + + Write DataFrame as a Delta Lake table on local filesystem. + + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on local filesystem. + Note: This will fail if schema of the new data does not match the + schema of existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + Note: If the schema of the new and old data is same, + then setting `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table on cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. 
+ header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... 
"e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... 
) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. 
+ + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... 
+ * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. 
Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + - \'window\': Truncate the start of the window with the \'every\' argument. + - \'datapoint\': Start from the first encountered data point. + - \'monday\': Start the window on the monday before the first data point. + - \'tuesday\': Start the window on the tuesday before the first data point. + - ... + - \'sunday\': Start the window on the sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... 
end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... 
) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. 
+ by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'m:m\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. 
+ + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... 
"b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function : {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + A predefined aggregate function str or an expression. + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot( + ... values="baz", index="foo", columns="bar", aggregate_function="first" + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... 
) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. 
+ + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. 
+ + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. 
+ + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... 
+ shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... 
"bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. 
+ + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) 
-> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialises all frame data as a list of rows. + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.rows() + [(1, 2), (3, 4), (5, 6)] + >>> df.rows(named=True) + [{\'a\': 1, \'b\': 2}, {\'a\': 3, \'b\': 4}, {\'a\': 5, \'b\': 6}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. 
+ buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters in your use-case + you should export to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + An iterator of tuples (default) or dictionaries (if named) of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. 
Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return Pearson product-moment correlation coefficients. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. 
+ + Parameters + ---------- + **kwargs + keyword arguments are passed to numpy corrcoef + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
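The generated frame stub above is committed under a version-specific directory (the _stubs/0.18.2/polars/... layout is visible in the file paths of the hunks below, and each stub starts with a "#: version" header). As a minimal sketch of how such a layout could be consumed — the helper name and the "newest stub not exceeding the installed polars" lookup strategy are illustrative assumptions, not the package's actual API — a caller might resolve the matching stub like this:

    from importlib.metadata import version as installed_version
    from pathlib import Path

    from packaging.version import Version

    def find_stub(stub_root: Path, import_path: Path) -> Path:
        # Hypothetical helper: each subdirectory of stub_root is named after a
        # polars release; pick the newest one that is not newer than the
        # polars version installed in the current environment.
        installed = Version(installed_version("polars"))
        candidates = sorted((Version(p.name) for p in stub_root.iterdir()), reverse=True)
        for candidate in candidates:
            if candidate <= installed:
                return stub_root / str(candidate) / import_path.with_suffix(".pyi")
        raise FileNotFoundError("no generated stub at or below the installed polars version")

    # Illustrative call, mirroring the directory layout in the diff paths below:
    # find_stub(Path("src/polugins_type_gen/_stubs"), Path("polars/dataframe/frame"))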
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/expr/expr deleted file mode 100644 index d4c897f..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/expr/expr +++ /dev/null @@ -1,257 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: set[str] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... 
- def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: Expr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... 
- def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... 
- def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... 
- def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
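The stub deleted above declared Expr as Generic[P] over a ParamSpec and typed pipe as Callable[Concatenate[Expr, P], T]; the replacement stub added below keeps that same pipe signature. A self-contained sketch of the pattern, using illustrative names unrelated to polars, shows what the Concatenate/ParamSpec combination buys: the type checker can validate the extra arguments forwarded through pipe against the callable's own parameters.

    from typing import Callable, TypeVar

    from typing_extensions import Concatenate, ParamSpec

    P = ParamSpec("P")
    T = TypeVar("T")

    class Chained:
        def pipe(
            self,
            function: Callable[Concatenate["Chained", P], T],
            *args: P.args,
            **kwargs: P.kwargs,
        ) -> T:
            # Concatenate binds the first parameter to the receiver; the
            # ParamSpec captures the rest, so *args/**kwargs are checked
            # against the signature of `function`.
            return function(self, *args, **kwargs)

    def scale(obj: "Chained", factor: int) -> "Chained":
        return obj

    Chained().pipe(scale, factor=2)        # accepted by a type checker
    # Chained().pipe(scale, factor="two")  # rejected: wrong argument type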
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/expr/expr.pyi new file mode 100644 index 0000000..de88f12 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/expr/expr.pyi @@ -0,0 +1,6380 @@ +#: version 0.18.2 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... 
+ def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + pl.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... 
) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. + + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to root column name. + + See Also + -------- + alias + map_alias + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to root column name. + + See Also + -------- + alias + map_alias + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + + >>> df.select(pl.all().reverse().suffix("_reverse")).with_columns( + ... pl.all().map_alias( + ... # Remove "_reverse" suffix and convert to lower case. + ... lambda col_name: col_name.rsplit("_reverse", 1)[0].lower() + ... ) + ... ) + shape: (2, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 2 ┆ 4 ┆ 2 ┆ 4 │ + │ 1 ┆ 3 ┆ 1 ┆ 3 │ + └───────────┴───────────┴─────┴─────┘ + + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... 
) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: Expr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... 
pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) 
-> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. 
+ + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. 
+ + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.fill_null(strategy="zero") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ 0 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(99) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ 99 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(strategy="forward") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.fill_nan("zero") + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪══════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ zero │ + │ zero ┆ 6.0 │ + └──────┴──────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.select(pl.all().backward_fill()) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 6 │ + │ null ┆ 6 │ + └──────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... 
).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + See Also + -------- + map_dict + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + ExprStringNameSpace.explode : Explode a string column. + + """ + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + """ + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. 
+ + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + """ + Method equivalent of inequality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... 
) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of logical exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... 
) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'linear\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. 
The resulting values will be aggregated to their sum. + + If you pass a ``by`` column ````, then by default the + windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + Otherwise, the window at a given row will include the row itself, and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_min(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + └──────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If you pass a ``by`` column ````, then by default the + windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + Otherwise, the window at a given row will include the row itself, and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_max(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If you pass a ``by`` column ````, then by default the + windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + Otherwise, the window at a given row will include the row itself, and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 8.0, 6.0, 2.0, 16.0, 10.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_mean(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.5 │ + │ 7.0 │ + │ 4.0 │ + │ 9.0 │ + │ 13.0 │ + └──────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If you pass a ``by`` column ````, then by default the + windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + Otherwise, the window at a given row will include the row itself, and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}`. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change.
+ + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_sum(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 3.0 │ + │ 5.0 │ + │ 7.0 │ + │ 9.0 │ + │ 11.0 │ + └──────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If you pass a ``by`` column ````, then by default the + windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + Otherwise, the window at a given row will include the row itself, and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_std(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 1.527525 │ + │ 2.0 │ + └──────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If you pass a ``by`` column ````, then by default the + windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + Otherwise, the window at a given row will include the row itself, and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_var(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 2.333333 │ + │ 4.0 │ + └──────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If you pass a ``by`` column ````, then by default the + windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + Otherwise, the window at a given row will include the row itself, and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_median(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + If you pass a ``by`` column ````, then by default the + windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + Otherwise, the window at a given row will include the row itself, and the + `window_size - 1` elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_quantile(quantile=0.33, window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + └──────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f32 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. 
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. 
+ sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... 
) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + This can actually hurt performance and can have a lot of contention. + It is advised not to use it until actually benchmarked on your problem. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/lazyframe/frame deleted file mode 100644 index 1d45f68..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/lazyframe/frame +++ /dev/null @@ -1,128 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr, Series as Series -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath -from typing import Any, Callable, Collection, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: set[str] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... 
- @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) 
-> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... 
- def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: str = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other): ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..566e03e --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/lazyframe/frame.pyi @@ -0,0 +1,3334 @@ +#: version 0.18.2 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. 
+ + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. 
+ streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. 
+ simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. 
+ row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... 
) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. 
+ Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. 
+ + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... 
pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.groupby_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + * \'datapoint\': Start from the first encountered data point. + * \'monday\': Start the window on the monday before the first data point. + * \'tuesday\': Start the window on the tuesday before the first data point. + * ... + * \'sunday\': Start the window on the sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_rolling + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... 
).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'m:m\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... 
"ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... 
).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... 
).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... 
) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. 
+ + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) 
-> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/series/series deleted file mode 100644 index c014a4b..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/series/series +++ /dev/null @@ -1,336 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from 
polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, find_stacklevel as find_stacklevel, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: set[str] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... 
- def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - def __floordiv__(self, other): ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... 
- def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, maintain_order: bool = ...) -> DataFrame: ... - def qcut(self, quantiles: list[float], *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool | None = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... 
- def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... 
- def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... 
- def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/series/series.pyi new file mode 100644 index 0000000..6fe4d3c --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.2/polars/series/series.pyi @@ -0,0 +1,4103 @@ +#: version 0.18.2 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as 
iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, find_stacklevel as find_stacklevel, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. 
+ + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. 
This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ...) 
-> DataFrame: + ''' + Bin values into discrete values. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1]) + shape: (12, 3) + ┌──────┬─────────────┬──────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪══════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ + │ 1.5 ┆ inf ┆ (1.0, inf] │ + │ 2.0 ┆ inf ┆ (1.0, inf] │ + │ 2.5 ┆ inf ┆ (1.0, inf] │ + └──────┴─────────────┴──────────────┘ + + ''' + def qcut(self, quantiles: list[float]) -> DataFrame: + ''' + Bin values into discrete values based on their quantiles. + + Parameters + ---------- + quantiles + Quaniles to create. + We expect quantiles ``0.0 <= quantile <= 1`` + labels + Labels to assign to the quantiles. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75]) + shape: (8, 3) + ┌──────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪═══════════════╡ + │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ + │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ + │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1.0 ┆ inf ┆ (0.25, inf] │ + │ 2.0 ┆ inf ┆ (0.25, inf] │ + └──────┴─────────────┴───────────────┘ + + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) 
-> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. 
+ + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. 
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ListNameSpace.explode : Explode a list column. + StringNameSpace.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). 
+ By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... 
) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. 
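# --- Illustrative sketch (editor-added; not part of the generated stub or patch) ---
# The rolling_* docstrings above describe `min_periods` and `weights`, but the
# bundled examples only use the defaults. A minimal sketch of `min_periods`,
# assuming a polars release matching these stubs:
import polars as pl

s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 5.0])

# Default min_periods equals window_size, so the first windows are null.
print(s.rolling_mean(window_size=3))
# With min_periods=1, partial leading windows still produce a value.
print(s.rolling_mean(window_size=3, min_periods=1))
# expected: [1.0, 1.5, 2.0, 3.0, 4.0]

# `weights` (not shown numerically here) multiplies the window elementwise
# before aggregating, as the parameter description above explains.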
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) 
-> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. 
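# --- Illustrative sketch (editor-added; not part of the generated stub or patch) ---
# The clip docstring above recommends a "when, then, otherwise" expression for
# dtypes it does not support. A minimal sketch of that pattern for dates,
# assuming a polars release matching these stubs:
from datetime import date

import polars as pl

s = pl.Series("d", [date(2021, 1, 1), date(2021, 6, 1), date(2022, 3, 1)])
lower, upper = date(2021, 2, 1), date(2021, 12, 31)

clipped = s.to_frame().select(
    pl.when(pl.col("d") < lower)
    .then(pl.lit(lower))
    .when(pl.col("d") > upper)
    .then(pl.lit(upper))
    .otherwise(pl.col("d"))
    .alias("d")
)["d"]
print(clipped)  # expected: [2021-02-01, 2021-06-01, 2021-12-31]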
+ + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
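# --- Illustrative sketch (editor-added; not part of the generated stub or patch) ---
# Unlike ewm_std/ewm_var below, the ewm_mean docstring above ships without an
# Examples section. A minimal sketch, assuming a polars release matching these
# stubs and the default adjust=True:
import polars as pl

s = pl.Series("a", [1, 2, 3])
print(s.ewm_mean(com=1))
# com=1 gives alpha = 0.5; with adjust=True the expected values are roughly
# [1.0, 1.666667, 2.428571]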
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
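# --- Illustrative sketch (editor-added; not part of the generated stub or patch) ---
# A small sketch exercising a few of the members stubbed at the end of the class
# above (set_sorted, shrink_dtype, and the `str` accessor property), assuming a
# polars release matching these stubs:
import polars as pl

s = pl.Series("a", [1, 2, 3])

# set_sorted flags the series so downstream code can take sorted fast paths;
# this is only safe because the data really is sorted (see the warning above).
print(s.set_sorted().max())  # expected: 3

# shrink_dtype downcasts to the smallest dtype that fits the extrema.
print(s.shrink_dtype().dtype)  # expected: Int8

# Namespace accessors such as `.str` are declared as plain properties in the stub.
print(pl.Series(["ab", "c"]).str.lengths())  # expected: [2, 1]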
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/dataframe/frame deleted file mode 100644 index 37abc8d..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/dataframe/frame +++ /dev/null @@ -1,282 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Null as Null, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, Struct as Struct, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit 
as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: set[str] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, names: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... 
- def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... 
- def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... 
- @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ...) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... 
- def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs): ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/dataframe/frame.pyi new file mode 100644 index 0000000..04413ce --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/dataframe/frame.pyi @@ -0,0 +1,5783 @@ +#: version 0.18.3 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Null as Null, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, Struct as Struct, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression 
+from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
+ schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. 
+ schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. 
+ + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series, dim: int) -> Series: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. 
+ + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'<f4\'), (\'ham\', \'<U1\')]) + + >>> import numpy as np + >>> df.to_numpy(True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'<f4\'), (\'ham\', \'<U1\')]) + + ''' + def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + <class \'pandas.core.frame.DataFrame\'> + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 <NA> + 1 2 <NA> b + 2 <NA> 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... 
) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. 
+ compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". 
+ column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. 
For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... 
column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... 
} + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection uri, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Note: Some polars data types like `Null`, `Categorical` and `Time` are + not supported by the delta protocol specification. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Examples + -------- + Instantiate a basic dataframe: + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... 
} + ... ) + + Write DataFrame as a Delta Lake table on local filesystem. + + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on local filesystem. + Note: This will fail if schema of the new data does not match the + schema of existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + Note: If the schema of the new and old data is same, + then setting `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table on cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
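# Editor's note (illustrative sketch, not part of the generated stub): the note
# above says transpose is very expensive and can often be avoided. A common reason
# to transpose is a row-wise reduction, which `DataFrame.fold` expresses directly
# without materialising a transposed copy. Frame and column names are assumptions.
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})

# Horizontal (row-wise) sum via a pairwise fold over the columns.
row_sums = df.fold(lambda s1, s2: s1 + s2)  # Series: [11, 22, 33]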
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. 
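# Editor's note (hedged sketch, not part of the generated stub): the `frame_equal`
# parameters above document `null_equal` ("Consider null values as equal"), but the
# examples do not exercise it. Assuming the default of null_equal=True, nulls in
# matching positions compare equal; with null_equal=False they never do.
import polars as pl

df1 = pl.DataFrame({"a": [1, None]})
df2 = pl.DataFrame({"a": [1, None]})

df1.frame_equal(df2)                    # True  (nulls treated as equal)
df1.frame_equal(df2, null_equal=False)  # False (nulls never compare equal)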
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... 
"ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a groupby operation. 
+ + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... 
+ * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. 
Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Notes + ----- + If you\'re coming from pandas, then + + .. 
code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... 
) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... 
) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. 
+ right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. 
note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. 
+ + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. 
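# Editor's note (illustrative sketch, not part of the generated stub): the prose
# above recommends `vstack` for repeated appends, finished with a single `rechunk`,
# and `extend` for a one-off append just before querying. The frames used here are
# assumptions standing in for e.g. data read from multiple files.
import polars as pl

frames = [pl.DataFrame({"foo": [i], "bar": [i * 10]}) for i in range(3)]

# Many appends: stack cheaply, then rechunk once before running queries.
df = frames[0]
for other in frames[1:]:
    df.vstack(other, in_place=True)
df = df.rechunk()

# Single append followed by a query: extend writes into the existing buffers.
df.extend(pl.DataFrame({"foo": [99], "bar": [990]}))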
+ + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... 
"b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function : {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + A predefined aggregate function str or an expression. + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot( + ... values="baz", index="foo", columns="bar", aggregate_function="first" + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... 
) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. 
+ + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. 
+ + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. 
+ + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... 
+ shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... 
"bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. 
+ + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) 
-> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialises all frame data as a list of rows. + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.rows() + [(1, 2), (3, 4), (5, 6)] + >>> df.rows(named=True) + [{\'a\': 1, \'b\': 2}, {\'a\': 3, \'b\': 4}, {\'a\': 5, \'b\': 6}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. 
+ buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters in your use-case + you should export to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + An iterator of tuples (default) or dictionaries (if named) of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. 
Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return Pearson product-moment correlation coefficients. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. 
+ + Parameters + ---------- + **kwargs + keyword arguments are passed to numpy corrcoef + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
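The added file above is the full regenerated `DataFrame` stub for this polars version. As a rough sketch only (the stub path, the chosen version directory, and the spot-checked method names below are assumptions for illustration, not something this diff or the repo's tooling does), a regenerated `.pyi` like this can be sanity-checked by parsing it with the standard-library `ast` module and confirming the expected class and a few of its methods survived generation and cleaning:

import ast
from pathlib import Path

# Assumed location of one regenerated stub; adjust version/module as needed.
stub_path = Path("src/polugins_type_gen/_stubs/0.18.3/polars/dataframe/frame.pyi")

tree = ast.parse(stub_path.read_text())

# Map each top-level class in the stub to the names of its methods.
classes = {
    node.name: {
        item.name
        for item in node.body
        if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef))
    }
    for node in tree.body
    if isinstance(node, ast.ClassDef)
}

assert "DataFrame" in classes, "generated stub lost the DataFrame class"
for method in ("join", "with_columns", "select"):
    assert method in classes["DataFrame"], f"stub is missing DataFrame.{method}"

Parsing with `ast` only proves the stub is syntactically valid Python and lists the expected symbols; it does not type-check the annotations themselves.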
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/expr/expr deleted file mode 100644 index d4c897f..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/expr/expr +++ /dev/null @@ -1,257 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: set[str] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... 
- def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: Expr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... 
- def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... 
- def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... 
- def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/expr/expr.pyi new file mode 100644 index 0000000..c278446 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/expr/expr.pyi @@ -0,0 +1,6390 @@ +#: version 0.18.3 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... 
+ def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + pl.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... 
) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. + + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to root column name. + + See Also + -------- + alias + map_alias + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to root column name. + + See Also + -------- + alias + map_alias + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + + >>> df.select(pl.all().reverse().suffix("_reverse")).with_columns( + ... pl.all().map_alias( + ... # Remove "_reverse" suffix and convert to lower case. + ... lambda col_name: col_name.rsplit("_reverse", 1)[0].lower() + ... ) + ... ) + shape: (2, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 2 ┆ 4 ┆ 2 ┆ 4 │ + │ 1 ┆ 3 ┆ 1 ┆ 3 │ + └───────────┴───────────┴─────┴─────┘ + + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... 
) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: Expr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... 
pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) 
-> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. 
+ + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. 
+ + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.fill_null(strategy="zero") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ 0 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(99) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ 99 ┆ 6 │ + └─────┴─────┘ + >>> df.fill_null(strategy="forward") + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.fill_nan("zero") + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪══════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ zero │ + │ zero ┆ 6.0 │ + └──────┴──────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.select(pl.all().backward_fill()) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 6 │ + │ null ┆ 6 │ + └──────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... 
).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... 
"b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + ExprStringNameSpace.explode : Explode a string column. + + """ + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). 
+ + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + """ + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + """ + Method equivalent of inequality operator ``expr != other`` where ``None == None``. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + Numeric or string value; accepts expression input. 
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. 
+ + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of logical exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. 
+ seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64-bit integers. For lower-bit integers, + you can safely use the cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If you pass a ``by`` column ````, then by default the + windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + Otherwise, the window at a given row will include the row itself, and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_min(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + └──────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If you pass a ``by`` column ````, then by default the + windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + Otherwise, the window at a given row will include the row itself, and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_max(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 5.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If you pass a ``by`` column ````, then by default the + windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + Otherwise, the window at a given row will include the row itself, and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 8.0, 6.0, 2.0, 16.0, 10.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_mean(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.5 │ + │ 7.0 │ + │ 4.0 │ + │ 9.0 │ + │ 13.0 │ + └──────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If you pass a ``by`` column ````, then by default the + windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + Otherwise, the window at a given row will include the row itself, and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_sum(window_size=2), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 3.0 │ + │ 5.0 │ + │ 7.0 │ + │ 9.0 │ + │ 11.0 │ + └──────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If you pass a ``by`` column ````, then by default the + windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + Otherwise, the window at a given row will include the row itself, and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_std(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 1.527525 │ + │ 2.0 │ + └──────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If you pass a ``by`` column ````, then by default the + windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + Otherwise, the window at a given row will include the row itself, and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_var(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 1.0 │ + │ 2.333333 │ + │ 4.0 │ + └──────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If you pass a ``by`` column ````, then by default the + windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + Otherwise, the window at a given row will include the row itself, and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_median(window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + │ 6.0 │ + └──────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + If you pass a ``by`` column ````, then by default the + windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + Otherwise, the window at a given row will include the row itself, and the + `window_size - 1` elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]}) + >>> df.select( + ... [ + ... pl.col("A").rolling_quantile(quantile=0.33, window_size=3), + ... ] + ... ) + shape: (6, 1) + ┌──────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 1.0 │ + │ 2.0 │ + │ 3.0 │ + │ 4.0 │ + └──────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f32 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. 
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. 
+ sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... 
) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + This can actually hurt performance and can have a lot of contention. + It is advised not to use it until actually benchmarked on your problem. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/lazyframe/frame deleted file mode 100644 index 4a23d5b..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/lazyframe/frame +++ /dev/null @@ -1,128 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr, Series as Series -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath -from typing import Any, Callable, Collection, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: set[str] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... 
- @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) 
-> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... 
- def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other): ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... 
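Both the removed stub above and the regenerated `frame.pyi` that follows type `LazyFrame.pipe` as `Callable[Concatenate[LazyFrame, P], T]` with `*args: P.args, **kwargs: P.kwargs`. A minimal sketch of what that ParamSpec-based signature gives a type checker is shown here; the helper name and column values are illustrative only and are not part of the stubs or of polars itself.

import polars as pl

def with_threshold(lf: pl.LazyFrame, column: str, threshold: int) -> pl.LazyFrame:
    # Ordinary user function: the first parameter is the frame, the rest are forwarded by pipe.
    return lf.filter(pl.col(column) > threshold)

lf = pl.LazyFrame({"a": [1, 2, 3]})

# With the Concatenate[LazyFrame, P] signature, a checker such as mypy should verify that
# "a" and threshold=1 match with_threshold's remaining parameters and infer the return type T.
out = lf.pipe(with_threshold, "a", threshold=1).collect()

# A call like lf.pipe(with_threshold, "a", threshold="1") should then be flagged
# statically instead of only surfacing when the query is collected.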
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..b3c5a69 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/lazyframe/frame.pyi @@ -0,0 +1,3357 @@ +#: version 0.18.3 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: bytes, pyarrow: bool = ...) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. 
+ + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. 
+ streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. 
+ simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. 
+ row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... 
) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. 
+ Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. 
+ + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... 
pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.groupby_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_rolling + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... 
"time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... 
).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... 
"ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... 
).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... 
).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... 
) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. 
+ + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) 
-> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/series/series deleted file mode 100644 index c014a4b..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/series/series +++ /dev/null @@ -1,336 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from 
polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, find_stacklevel as find_stacklevel, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: set[str] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... 
- def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - def __floordiv__(self, other): ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... 
- def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, maintain_order: bool = ...) -> DataFrame: ... - def qcut(self, quantiles: list[float], *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool | None = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... 
- def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... 
- def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... 
- def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/series/series.pyi new file mode 100644 index 0000000..a023088 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.3/polars/series/series.pyi @@ -0,0 +1,4108 @@ +#: version 0.18.3 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as 
iterable_to_pyseries, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, find_stacklevel as find_stacklevel, is_int_sequence as is_int_sequence, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. 
+ + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int] | list[bool]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. 
This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ...) 
-> DataFrame: + ''' + Bin values into discrete values. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1]) + shape: (12, 3) + ┌──────┬─────────────┬──────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪══════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ + │ 1.5 ┆ inf ┆ (1.0, inf] │ + │ 2.0 ┆ inf ┆ (1.0, inf] │ + │ 2.5 ┆ inf ┆ (1.0, inf] │ + └──────┴─────────────┴──────────────┘ + + ''' + def qcut(self, quantiles: list[float]) -> DataFrame: + ''' + Bin values into discrete values based on their quantiles. + + Parameters + ---------- + quantiles + Quaniles to create. + We expect quantiles ``0.0 <= quantile <= 1`` + labels + Labels to assign to the quantiles. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75]) + shape: (8, 3) + ┌──────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪═══════════════╡ + │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ + │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ + │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1.0 ┆ inf ┆ (0.25, inf] │ + │ 2.0 ┆ inf ┆ (0.25, inf] │ + └──────┴─────────────┴───────────────┘ + + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) 
-> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. 
+ + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. 
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ListNameSpace.explode : Explode a list column. + StringNameSpace.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). 
+ By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... 
) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) 
-> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. 
+ + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to use fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ...
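# Illustrative sketch (not part of the generated stub): the namespace properties
# directly above ("bin", "cat", "dt", "list", "arr", "str", "struct") are emitted
# without return annotations, so a type checker consuming this stub will typically
# see them as Any; at runtime they resolve to the usual polars namespace objects.
# Assumes a polars install from roughly the same 0.18.x range.
import polars as pl
from datetime import date

s = pl.Series("words", ["foo", "barbar"])
d = pl.Series("days", [date(2023, 1, 1), date(2023, 1, 2)])
print(type(s.str))  # string namespace object at runtime
print(type(d.dt))   # datetime namespace object at runtime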
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/dataframe/frame deleted file mode 100644 index d6ae2d9..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/dataframe/frame +++ /dev/null @@ -1,281 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Null as Null, Object as Object, Struct as Struct, Time as Time, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as 
no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: set[str] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
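# Illustrative sketch (not part of the diff): the private `_from_dict` /
# `_from_records` constructors listed above are the targets that public helpers
# such as `pl.from_dict` dispatch to; the public API is the supported entry point.
# `frame_equal` is the 0.18-era name (later renamed). Assumes polars is installed.
import polars as pl

df1 = pl.from_dict({"a": [1, 2], "b": ["x", "y"]})
df2 = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})
print(df1.frame_equal(df2))  # True: both routes build the same frame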
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, names: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... 
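# Illustrative sketch (not part of the diff): per the dunder signatures above,
# comparison operators return an element-wise boolean DataFrame rather than a
# single bool, and arithmetic with a scalar broadcasts over all columns.
# Assumes polars is installed.
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
print(df * 2)    # __mul__ -> DataFrame
print(df == df)  # __eq__  -> DataFrame of booleans, one per element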
- def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
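# Illustrative sketch (not part of the diff): the `file=None` overloads of
# `write_csv` / `write_json` above return the serialized text instead of writing
# to disk, which is why the stubs carry separate `str` and `None` return overloads.
# Assumes polars is installed.
import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3], "bar": ["a", "b", "c"]})
print(df.write_csv())   # no file target -> returns the CSV as a str
print(df.write_json())  # likewise returns a JSON string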
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... 
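# Illustrative sketch (not part of the diff) exercising `filter` and `sort` from
# the signatures above; `descending` is the 0.18-era keyword. Assumes polars is installed.
import polars as pl

df = pl.DataFrame({"a": [3, 1, 2], "b": ["x", "y", "z"]})
print(df.filter(pl.col("a") > 1).sort("a", descending=True))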
- def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... 
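# Illustrative sketch (not part of the diff) for the `join` and `groupby`
# signatures above; `how` takes a JoinStrategy literal and `groupby` is the
# 0.18-era spelling (later renamed `group_by`). Assumes polars is installed.
import polars as pl

left = pl.DataFrame({"key": [1, 2, 3], "x": ["a", "b", "c"]})
right = pl.DataFrame({"key": [2, 3, 4], "y": [20, 30, 40]})
joined = left.join(right, on="key", how="inner")
print(joined.groupby("x").agg(pl.col("y").sum()))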
- @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... 
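# Illustrative sketch (not part of the diff) for `with_columns`, `select` and the
# `named=True` overload of `row` listed above. Assumes polars is installed.
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
out = df.with_columns((pl.col("a") + pl.col("b")).alias("total")).select("a", "total")
print(out.row(0, named=True))  # named=True overload -> dict[str, Any]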
- def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs): ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/dataframe/frame.pyi new file mode 100644 index 0000000..deaf988 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/dataframe/frame.pyi @@ -0,0 +1,5790 @@ +#: version 0.18.4 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Null as Null, Object as Object, Struct as Struct, Time as Time, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype, unpack_dtypes as unpack_dtypes +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import 
_timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, data: Sequence[Sequence[Any]], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. 
+ orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. 
+ * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. 
+ + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... 
) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. 
+ compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". 
+ column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. 
For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... 
column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... 
} + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection uri, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Note: Some polars data types like `Null`, `Categorical` and `Time` are + not supported by the delta protocol specification. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Examples + -------- + Instantiate a basic dataframe: + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... 
} + ... ) + + Write DataFrame as a Delta Lake table on local filesystem. + + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on local filesystem. + Note: This will fail if schema of the new data does not match the + schema of existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + Note: If the schema of the new and old data is same, + then setting `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table on cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... 
"ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a groupby operation. 
+ + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... 
+ * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). 
+ + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). 
+ + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... 
) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... 
) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. 
+ right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. 
note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. 
+ + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. 
+ + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... 
"b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot( + ... values="baz", index="foo", columns="bar", aggregate_function="first" + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... 
) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. 
+ + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. 
+ + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. 
+ + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... 
+ shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... 
) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... 
) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialises all frame data as a list of rows. + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.rows() + [(1, 2), (3, 4), (5, 6)] + >>> df.rows(named=True) + [{\'a\': 1, \'b\': 2}, {\'a\': 3, \'b\': 4}, {\'a\': 5, \'b\': 6}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. 
+ + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters in your use-case + you should export to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + An iterator of tuples (default) or dictionaries (if named) of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. 
+ + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return Pearson product-moment correlation coefficients. 
+ + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + keyword arguments are passed to numpy corrcoef + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
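# A minimal sketch (not part of the generated stub) of the equivalence the
# `DataFrame.update` docstring above describes: a left join followed by a
# coalesce. It reuses the docstring's example frames; joining on an explicit
# row-count column is an assumption based on the docstring's "If none given
# the row count is used" wording, and the API calls follow polars 0.18.x,
# matching the stubs in this diff.
import polars as pl

df = pl.DataFrame({"A": [1, 2, 3, 4], "B": [400, 500, 600, 700]})
new_df = pl.DataFrame({"B": [4, None, 6], "C": [7, 8, 9]})

joined = df.with_row_count("row_nr").join(
    new_df.with_row_count("row_nr"), on="row_nr", how="left"
)
# Keep the right-hand value where it is non-null, otherwise the original value.
emulated = joined.select(
    pl.col("A"),
    pl.coalesce(pl.col("B_right"), pl.col("B")).alias("B"),
)
assert emulated.to_dicts() == df.update(new_df).to_dicts()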
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/expr/expr deleted file mode 100644 index d2746c0..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/expr/expr +++ /dev/null @@ -1,259 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: set[str] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... 
- def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: Expr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... 
- def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... 
- def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... 
- def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
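# Both the deleted stub above and the regenerated expr.pyi below type
# `Expr.pipe` with ParamSpec + Concatenate, so a type checker can validate the
# extra arguments forwarded through `pipe` against the piped function's own
# signature. A standalone sketch of that typing pattern (illustration only;
# `MiniExpr` and `scale` are made-up names, and typing_extensions is assumed
# to be available, as the stubs themselves import from it):
from __future__ import annotations

from typing import Callable, TypeVar

from typing_extensions import Concatenate, ParamSpec

P = ParamSpec("P")
T = TypeVar("T")

class MiniExpr:
    def pipe(self, function: Callable[Concatenate[MiniExpr, P], T], *args: P.args, **kwargs: P.kwargs) -> T:
        # Forward the expression itself plus any user-supplied args/kwargs.
        return function(self, *args, **kwargs)

def scale(expr: MiniExpr, *, n: int = 1) -> MiniExpr:
    # The body is irrelevant here; the signature is what ParamSpec captures.
    return expr

# A type checker verifies `n=5` against scale's keyword-only parameter;
# misuse such as `pipe(scale, m=5)` would be flagged.
result: MiniExpr = MiniExpr().pipe(scale, n=5)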
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/expr/expr.pyi new file mode 100644 index 0000000..f518f07 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/expr/expr.pyi @@ -0,0 +1,7315 @@ +#: version 0.18.4 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... 
+ def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + pl.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... 
) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. + + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to root column name. + + See Also + -------- + alias + map_alias + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to root column name. + + See Also + -------- + alias + map_alias + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + + >>> df.select(pl.all().reverse().suffix("_reverse")).with_columns( + ... pl.all().map_alias( + ... # Remove "_reverse" suffix and convert to lower case. + ... lambda col_name: col_name.rsplit("_reverse", 1)[0].lower() + ... ) + ... ) + shape: (2, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 2 ┆ 4 ┆ 2 ┆ 4 │ + │ 1 ┆ 3 ┆ 1 ┆ 3 │ + └───────────┴───────────┴─────┴─────┘ + + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... 
) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: Expr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... 
pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) 
-> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. 
+ + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. 
+ + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. 
+ + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... 
pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. 
via calling + a c function) + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + ExprStringNameSpace.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq_missing(pl.col("y")).alias("x == y"), + ... ) + shape: (6, 3) + ┌──────┬──────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞══════╪══════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + │ null ┆ 5.0 ┆ false │ + │ null ┆ null ┆ true │ + └──────┴──────┴────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... 
"y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne_missing(pl.col("y")).alias("x != y"), + ... ) + shape: (6, 3) + ┌──────┬──────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞══════╪══════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + │ null ┆ 5.0 ┆ true │ + │ null ┆ null ┆ false │ + └──────┴──────┴────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... 
) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... 
) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of logical exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) 
-> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... 
pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'linear\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, *args, **kwargs) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, *args, **kwargs) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, *args, **kwargs) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. 
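One way to read the weighted rolling examples above — a sketch inferred from the printed values rather than from a documented algorithm — is that the weights are applied elementwise to each window before the aggregation (here, the max) runs; the "aggregated to their sum" wording in the description is generic boilerplate:

import polars as pl

df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]})
vals = df["A"].to_list()
weights = [0.25, 0.75]
# Multiply each length-2 window elementwise by the weights, then take the max,
# e.g. [1.0, 2.0] -> [0.25, 1.5] -> 1.5.
manual = [
    max(w * v for w, v in zip(weights, vals[i - 1 : i + 1]))
    for i in range(1, len(vals))
]
assert manual == [1.5, 2.25, 3.0, 3.75, 4.5]  # same as the weighted rolling_max output above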
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, *args, **kwargs) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, *args, **kwargs) -> Self: + ''' + Compute a rolling standard deviation. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.883883 │ + │ 3.0 ┆ 1.237437 │ + │ 4.0 ┆ 1.59099 │ + │ 5.0 ┆ 1.944544 │ + │ 6.0 ┆ 2.298097 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, *args, **kwargs) -> Self: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.78125 │ + │ 3.0 ┆ 1.53125 │ + │ 4.0 ┆ 2.53125 │ + │ 5.0 ┆ 3.78125 │ + │ 6.0 ┆ 5.28125 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... 
window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, *args, **kwargs) -> Self: + ''' + Compute a rolling median. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.875 │ + │ 3.0 ┆ 1.375 │ + │ 4.0 ┆ 1.875 │ + │ 5.0 ┆ 2.375 │ + │ 6.0 ┆ 2.875 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, *args, **kwargs) -> Self: + ''' + Compute a rolling quantile. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be + multiplied elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... 
) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 0.8 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.2 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f32 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. 
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. 
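For the non-numerical case mentioned above, the suggested when/then/otherwise workaround can look like this (a sketch with a made-up Date column and cut-off):

import polars as pl
from datetime import date

df = pl.DataFrame({"d": [date(2020, 1, 1), date(2026, 6, 1), date(2030, 1, 1)]})
upper = date(2025, 1, 1)
# Emulates clip_max for a Date column, which clip itself does not accept.
df.with_columns(
    pl.when(pl.col("d") > pl.lit(upper))
    .then(pl.lit(upper))
    .otherwise(pl.col("d"))
    .alias("d_clipped")
)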
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. 
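These trigonometric expressions operate on radians; a column of degrees can be converted first with radians() (documented further down), for example:

import polars as pl

df = pl.DataFrame({"deg": [0.0, 90.0, 180.0]})
# Degrees -> radians -> sine: yields 0.0, 1.0 and ~1.2e-16 (zero up to floating point).
df.select(pl.col("deg").radians().sin().alias("sin_of_deg"))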
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + ExprListNameSpace.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. 
+ + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. 
+ + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. 
+ + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. 
valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + This can actually hurt performance and can have a lot of contention. + It is advised not to use it until actually benchmarked on your problem. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... 
).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... 
+ @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/lazyframe/frame deleted file mode 100644 index 315d634..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/lazyframe/frame +++ /dev/null @@ -1,128 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr, Series as Series -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath -from typing import Any, Callable, Collection, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: set[str] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None 
= ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... - @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... 
- def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... 
- def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other): ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... 
- def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..da95262 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/lazyframe/frame.pyi @@ -0,0 +1,3357 @@ +#: version 0.18.4 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. 
+ predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. 
+ + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. 
+ Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. 
+ slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: Expr | str | Series | list[bool]) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. 
+ + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.groupby_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often to type Date/Datetime + This column must be sorted in ascending order. If not the output will not + make sense. 
+ + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_rolling + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... 
] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. 
+ strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 day) + - 1w (1 week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. 
+ force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. 
+ + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... 
).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. 
+ strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. 
+ + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... 
) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. 
+ + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + """ + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + """ + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) 
-> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/series/series deleted file mode 100644 index 8c46e2a..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/series/series +++ /dev/null @@ -1,337 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace 
as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, find_stacklevel as find_stacklevel, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: set[str] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... 
- @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - def __floordiv__(self, other): ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... 
- def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, maintain_order: bool = ...) -> DataFrame: ... - def qcut(self, quantiles: list[float], *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ...) -> DataFrame: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool | None = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... 
- def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... 
- def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... 
- def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/series/series.pyi new file mode 100644 index 0000000..7c559e2 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.4/polars/series/series.pyi @@ -0,0 +1,4109 @@ +#: version 0.18.4 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as 
numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, find_stacklevel as find_stacklevel, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: ClassVar[None] = ... + _accessors: ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + """ + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. 
+ + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ...) -> DataFrame: + ''' + Bin values into discrete values. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. 
+ break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1]) + shape: (12, 3) + ┌──────┬─────────────┬──────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪══════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ + │ 1.5 ┆ inf ┆ (1.0, inf] │ + │ 2.0 ┆ inf ┆ (1.0, inf] │ + │ 2.5 ┆ inf ┆ (1.0, inf] │ + └──────┴─────────────┴──────────────┘ + + ''' + def qcut(self, quantiles: list[float]) -> DataFrame: + ''' + Bin values into discrete values based on their quantiles. + + Parameters + ---------- + quantiles + List of quantiles to create. + We expect quantiles ``0.0 <= quantile <= 1`` + labels + Labels to assign to the quantiles. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. + category_label + Name given to the category column. + maintain_order + Keep the order of the original `Series`. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75]) + shape: (8, 3) + ┌──────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪═══════════════╡ + │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ + │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ + │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1.0 ┆ inf ┆ (0.25, inf] │ + │ 2.0 ┆ inf ┆ (0.25, inf] │ + └──────┴─────────────┴───────────────┘ + + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) 
-> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. 
+ + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. 
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + ListNameSpace.explode : Explode a list column. + StringNameSpace.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). 
+ By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... 
) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) 
-> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. 
+ + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + ListNameSpace.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
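# A minimal sketch of how a generated series stub such as the one above can be inspected,
# e.g. to compare the public Series API surface across polars versions. The stub path used
# here is an assumption for illustration only; adjust it to the repository's actual layout.
import ast
from pathlib import Path


def public_series_members(stub_path: Path) -> list[str]:
    """Parse a generated series stub and list the public functions/properties of `Series`."""
    tree = ast.parse(stub_path.read_text())
    for node in ast.walk(tree):
        if isinstance(node, ast.ClassDef) and node.name == "Series":
            return sorted(
                member.name
                for member in node.body
                if isinstance(member, ast.FunctionDef) and not member.name.startswith("_")
            )
    return []


# Illustrative usage (hypothetical path):
# print(public_series_members(Path("src/polugins_type_gen/_stubs/0.18.5/polars/series/series.pyi")))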
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.5/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.5/polars/dataframe/frame deleted file mode 100644 index bc3c2db..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.5/polars/dataframe/frame +++ /dev/null @@ -1,282 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase, TextIOWrapper -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Null as Null, Object as Object, Struct as Struct, Time as Time, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SelectorType as SelectorType, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as 
UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, names: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... 
- def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... 
- def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... 
- @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - def rows_by_key(self, key: str | Sequence[str] | SelectorType, *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ... 
- @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs: Any) -> DataFrame: ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.5/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.5/polars/expr/expr deleted file mode 100644 index b3d4851..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.5/polars/expr/expr +++ /dev/null @@ -1,261 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import 
threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... 
- def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: list[float], labels: list[str] | None = ..., left_closed: bool = ...) -> Self: ... - def qcut(self, probs: list[float], labels: list[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... 
- def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... 
- def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ..., fixed_seed: bool = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ..., fixed_seed: bool = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.5/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.5/polars/lazyframe/frame deleted file mode 100644 index c4e7e0e..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.5/polars/lazyframe/frame +++ /dev/null @@ -1,128 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath -from typing import Any, Callable, ClassVar, Collection, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... 
- @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) 
-> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: IntoExpr) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... 
- def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... 
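(Editorial aside, not part of the diff content.) The DataFrame/LazyFrame/Expr stubs above keep `pipe` typed as `Callable[Concatenate[..., P], T]` with `*args: P.args, **kwargs: P.kwargs`, which is what lets a type checker validate the extra arguments forwarded through `pipe` and infer its return type. A minimal sketch of such a call; the helper function and column names are invented for illustration:

```python
import polars as pl

def add_total(ldf: pl.LazyFrame, tax_rate: float) -> pl.LazyFrame:
    # Hypothetical helper; the column names are illustrative only.
    return ldf.with_columns((pl.col("price") * (1 + tax_rate)).alias("total"))

ldf = pl.LazyFrame({"price": [10.0, 20.0]})

# Under Callable[Concatenate[LazyFrame, P], T], a checker verifies that
# tax_rate=0.2 matches add_total's remaining parameters and infers the
# result of pipe() as pl.LazyFrame.
print(ldf.pipe(add_total, tax_rate=0.2).collect())
```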
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.5/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.5/polars/series/series deleted file mode 100644 index 5c7cef7..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.5/polars/series/series +++ /dev/null @@ -1,340 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type 
-from polars.utils.various import _is_generator as _is_generator, find_stacklevel as find_stacklevel, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... 
- @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - @overload - def __floordiv__(self, other: Expr) -> Expr: ... - @overload - def __floordiv__(self, other: Any) -> Series: ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... 
- def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, maintain_order: bool = ..., series: bool = ..., left_closed: bool = ...) -> DataFrame | Series: ... - def qcut(self, quantiles: list[float], *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ..., series: bool = ..., left_closed: bool = ..., allow_duplicates: bool = ...) -> DataFrame | Series: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool | None = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) 
-> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... 
- def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... 
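Likewise not part of the patch: a small sketch exercising a few of the Series methods listed in the removed stub. The series name and values are made up; the calls follow the signatures shown above (note that `descending` is keyword-only).

import polars as pl

s = pl.Series("values", [1.0, None, 3.0, 4.0])
filled = s.fill_null(strategy="forward")        # FillNullStrategy
rolling = filled.rolling_mean(window_size=2)    # rolling window helper
top_two = filled.sort(descending=True).head(2)  # keyword-only descending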
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/dataframe/frame deleted file mode 100644 index bc3c2db..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/dataframe/frame +++ /dev/null @@ -1,282 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase, TextIOWrapper -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Null as Null, Object as Object, Struct as Struct, Time as Time, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SelectorType as SelectorType, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf 
as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @columns.setter - def columns(self, names: Sequence[str]) -> None: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... 
- def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... 
- def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... 
- @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - def rows_by_key(self, key: str | Sequence[str] | SelectorType, *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ... 
- @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs: Any) -> DataFrame: ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/dataframe/frame.pyi new file mode 100644 index 0000000..b2379f1 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/dataframe/frame.pyi @@ -0,0 +1,5967 @@ +#: version 0.18.6 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Null as Null, Object as Object, Struct as Struct, Time as Time, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as 
sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. 
+ + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. 
+ + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. 
+ + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. 
+ + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + ``structured`` is set to ``False`` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) 
-> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). 
+ date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. 
+ * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". 
The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... 
) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... 
"style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to append to or create in the SQL database. + connection_uri + Connection uri, for example + + * "postgresql://username:password@server:port/database" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. 
+ + Note: Some polars data types like `Null`, `Categorical` and `Time` are + not supported by the delta protocol specification. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Examples + -------- + Instantiate a basic dataframe: + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + + Write DataFrame as a Delta Lake table on local filesystem. + + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on local filesystem. + Note: This will fail if schema of the new data does not match the + schema of existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + Note: If the schema of the new and old data is same, + then setting `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table on cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... 
schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... 
"e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... 
) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. 
+ + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... 
+ * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... 
datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... 
], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). 
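+
+ As a rough sketch of that caching pattern (``expensive_udf`` here is only a
+ placeholder for a costly function of your own):
+
+ .. code-block:: python
+
+     from functools import lru_cache
+
+     @lru_cache(maxsize=None)
+     def expensive_udf(value: int) -> int:
+         # stand-in for an expensive per-value computation
+         return value * 2
+
+     df.apply(lambda row: expensive_udf(row[0]))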
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. 
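+
+ The many-append pattern described above (repeated ``vstack`` calls finished by a
+ single ``rechunk``) can be sketched as follows, assuming ``df`` is an existing
+ DataFrame and ``frames`` is an iterable of DataFrames with the same schema:
+
+ .. code-block:: python
+
+     combined = df
+     for frame in frames:
+         combined = combined.vstack(frame)
+     combined = combined.rechunk()  # defragment into a contiguous allocation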
+ + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... 
"b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot( + ... values="baz", index="foo", columns="bar", aggregate_function="first" + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.arange(0, 9, eager=True), + ... } + ... 
) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. 
+ + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. 
+ + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. 
+ + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... 
+ shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
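+
+ In other words, for a column ``x`` with ``N`` elements the computed value is
+ roughly:
+
+ .. code-block:: python
+
+     std = (((x - x.mean()) ** 2).sum() / (N - ddof)) ** 0.5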
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. 
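+
+ With ``drop_first=True`` each encoded column yields one indicator column fewer;
+ for the frame in the example below this would leave only ``foo_2``, ``bar_4`` and
+ ``ham_b`` (sketch):
+
+ .. code-block:: python
+
+     df.to_dummies(drop_first=True)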
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... 
) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... 
) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format, as this method returns only python-native values. + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using ``iter_rows`` instead to avoid + materialising all the data at once. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... 
) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + ''' + def rows_by_key(self, key: str | Sequence[str] | SelectorType) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters you should export + to a different format, as this method returns only python-native values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... 
) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that python + natively only supports up to ``us``-precision; if this matters in your use-case + you should export to a different format. + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + An iterator of tuples (default) or dictionaries (if named) of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. 
+ + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... 
"b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy ``corrcoef``. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... 
).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/expr/expr deleted file mode 100644 index b3d4851..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/expr/expr +++ /dev/null @@ -1,261 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... 
- def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... 
- def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: list[float], labels: list[str] | None = ..., left_closed: bool = ...) -> Self: ... - def qcut(self, probs: list[float], labels: list[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ...) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... 
- def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... 
- def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ..., fixed_seed: bool = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ..., fixed_seed: bool = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
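The removed stub above types `Expr.pipe` through a `ParamSpec` (`Callable[Concatenate[Expr, P], T]` with `*args: P.args, **kwargs: P.kwargs`), and the regenerated stub below keeps the same signature. The sketch that follows illustrates what that typing is meant to enable; whether a checker actually enforces it depends on the stub resolving cleanly. It assumes polars 0.18.x, and `scaled` is a hypothetical helper, not an API from this repository or from polars.

import polars as pl

def scaled(expr: pl.Expr, factor: int) -> pl.Expr:
    # Multiply the expression by a constant factor.
    return expr * factor

df = pl.DataFrame({"a": [1, 2, 3]})
# With the ParamSpec-based signature, a type checker can verify that the
# forwarded `factor=10` matches the parameters of `scaled`; a misspelled or
# wrongly typed keyword would be reported when the stub is in use.
print(df.select(pl.col("a").pipe(scaled, factor=10)))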
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/expr/expr.pyi new file mode 100644 index 0000000..0d7afa4 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/expr/expr.pyi @@ -0,0 +1,7508 @@ +#: version 0.18.6 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... 
+ def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... 
) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. + + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to root column name. + + See Also + -------- + alias + map_alias + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to root column name. + + See Also + -------- + alias + map_alias + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + + >>> df.select(pl.all().reverse().suffix("_reverse")).with_columns( + ... pl.all().map_alias( + ... # Remove "_reverse" suffix and convert to lower case. + ... lambda col_name: col_name.rsplit("_reverse", 1)[0].lower() + ... ) + ... ) + shape: (2, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 2 ┆ 4 ┆ 2 ┆ 4 │ + │ 1 ┆ 3 ┆ 1 ┆ 3 │ + └───────────┴───────────┴─────┴─────┘ + + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... 
) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... 
pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) 
-> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. 
+
+ See Also
+ --------
+ bottom_k
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "value": [1, 98, 2, 3, 99, 4],
+ ... }
+ ... )
+ >>> df.select(
+ ... [
+ ... pl.col("value").top_k().alias("top_k"),
+ ... pl.col("value").bottom_k().alias("bottom_k"),
+ ... ]
+ ... )
+ shape: (5, 2)
+ ┌───────┬──────────┐
+ │ top_k ┆ bottom_k │
+ │ --- ┆ --- │
+ │ i64 ┆ i64 │
+ ╞═══════╪══════════╡
+ │ 99 ┆ 1 │
+ │ 98 ┆ 2 │
+ │ 4 ┆ 3 │
+ │ 3 ┆ 4 │
+ │ 2 ┆ 98 │
+ └───────┴──────────┘
+
+ '''
+ def bottom_k(self, k: int = ...) -> Self:
+ '''
+ Return the `k` smallest elements.
+
+ This has time complexity:
+
+ .. math:: O(n + k \\\\log{}n - \\frac{k}{2})
+
+ Parameters
+ ----------
+ k
+ Number of elements to return.
+
+ See Also
+ --------
+ top_k
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "value": [1, 98, 2, 3, 99, 4],
+ ... }
+ ... )
+ >>> df.select(
+ ... [
+ ... pl.col("value").top_k().alias("top_k"),
+ ... pl.col("value").bottom_k().alias("bottom_k"),
+ ... ]
+ ... )
+ shape: (5, 2)
+ ┌───────┬──────────┐
+ │ top_k ┆ bottom_k │
+ │ --- ┆ --- │
+ │ i64 ┆ i64 │
+ ╞═══════╪══════════╡
+ │ 99 ┆ 1 │
+ │ 98 ┆ 2 │
+ │ 4 ┆ 3 │
+ │ 3 ┆ 4 │
+ │ 2 ┆ 98 │
+ └───────┴──────────┘
+
+ '''
+ def arg_sort(self) -> Self:
+ '''
+ Get the index values that would sort this column.
+
+ Parameters
+ ----------
+ descending
+ Sort in descending order.
+ nulls_last
+ Place null values last instead of first.
+
+ Returns
+ -------
+ Expr
+ Series of dtype UInt32.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "a": [20, 10, 30],
+ ... }
+ ... )
+ >>> df.select(pl.col("a").arg_sort())
+ shape: (3, 1)
+ ┌─────┐
+ │ a │
+ │ --- │
+ │ u32 │
+ ╞═════╡
+ │ 1 │
+ │ 0 │
+ │ 2 │
+ └─────┘
+
+ '''
+ def arg_max(self) -> Self:
+ '''
+ Get the index of the maximal value.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "a": [20, 10, 30],
+ ... }
+ ... )
+ >>> df.select(pl.col("a").arg_max())
+ shape: (1, 1)
+ ┌─────┐
+ │ a │
+ │ --- │
+ │ u32 │
+ ╞═════╡
+ │ 2 │
+ └─────┘
+
+ '''
+ def arg_min(self) -> Self:
+ '''
+ Get the index of the minimal value.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "a": [20, 10, 30],
+ ... }
+ ... )
+ >>> df.select(pl.col("a").arg_min())
+ shape: (1, 1)
+ ┌─────┐
+ │ a │
+ │ --- │
+ │ u32 │
+ ╞═════╡
+ │ 1 │
+ └─────┘
+
+ '''
+ def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self:
+ '''
+ Find indices where elements should be inserted to maintain order.
+
+ .. math:: a[i-1] < v <= a[i]
+
+ Parameters
+ ----------
+ element
+ Expression or scalar value.
+ side : {\'any\', \'left\', \'right\'}
+ If \'any\', the index of the first suitable location found is given.
+ If \'left\', the index of the leftmost suitable location found is given.
+ If \'right\', the index of the rightmost suitable location found is given.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "values": [1, 2, 3, 5],
+ ... }
+ ... )
+ >>> df.select(
+ ... [
+ ... pl.col("values").search_sorted(0).alias("zero"),
+ ... pl.col("values").search_sorted(3).alias("three"),
+ ... pl.col("values").search_sorted(6).alias("six"),
+ ... ]
+ ... )
+ shape: (1, 3)
+ ┌──────┬───────┬─────┐
+ │ zero ┆ three ┆ six │
+ │ --- ┆ --- ┆ --- │
+ │ u32 ┆ u32 ┆ u32 │
+ ╞══════╪═══════╪═════╡
+ │ 0 ┆ 2 ┆ 4 │
+ └──────┴───────┴─────┘
+
+ '''
+ def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self:
+ '''
+ Sort this column by the ordering of other columns.
+ + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. 
+
+ >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min"))
+ shape: (5, 4)
+ ┌─────┬─────┬─────┬───────┐
+ │ a ┆ b ┆ c ┆ c_min │
+ │ --- ┆ --- ┆ --- ┆ --- │
+ │ str ┆ i64 ┆ i64 ┆ i64 │
+ ╞═════╪═════╪═════╪═══════╡
+ │ a ┆ 1 ┆ 5 ┆ 5 │
+ │ a ┆ 2 ┆ 4 ┆ 4 │
+ │ b ┆ 3 ┆ 3 ┆ 1 │
+ │ b ┆ 5 ┆ 2 ┆ 2 │
+ │ b ┆ 3 ┆ 1 ┆ 1 │
+ └─────┴─────┴─────┴───────┘
+
+ Or use positional arguments to group by multiple columns in the same way.
+
+ >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min"))
+ shape: (5, 4)
+ ┌─────┬─────┬─────┬───────┐
+ │ a ┆ b ┆ c ┆ c_min │
+ │ --- ┆ --- ┆ --- ┆ --- │
+ │ str ┆ i64 ┆ i64 ┆ i64 │
+ ╞═════╪═════╪═════╪═══════╡
+ │ a ┆ 1 ┆ 5 ┆ 5 │
+ │ a ┆ 2 ┆ 4 ┆ 4 │
+ │ b ┆ 3 ┆ 3 ┆ 1 │
+ │ b ┆ 5 ┆ 2 ┆ 1 │
+ │ b ┆ 3 ┆ 1 ┆ 1 │
+ └─────┴─────┴─────┴───────┘
+
+ '''
+ def is_unique(self) -> Self:
+ '''
+ Get mask of unique values.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame({"a": [1, 1, 2]})
+ >>> df.select(pl.col("a").is_unique())
+ shape: (3, 1)
+ ┌───────┐
+ │ a │
+ │ --- │
+ │ bool │
+ ╞═══════╡
+ │ false │
+ │ false │
+ │ true │
+ └───────┘
+
+ '''
+ def is_first(self) -> Self:
+ '''
+ Get a mask of the first unique value.
+
+ Returns
+ -------
+ Boolean Series
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "num": [1, 2, 3, 1, 5],
+ ... }
+ ... )
+ >>> df.with_columns(pl.col("num").is_first().alias("is_first"))
+ shape: (5, 2)
+ ┌─────┬──────────┐
+ │ num ┆ is_first │
+ │ --- ┆ --- │
+ │ i64 ┆ bool │
+ ╞═════╪══════════╡
+ │ 1 ┆ true │
+ │ 2 ┆ true │
+ │ 3 ┆ true │
+ │ 1 ┆ false │
+ │ 5 ┆ true │
+ └─────┴──────────┘
+
+ '''
+ def is_duplicated(self) -> Self:
+ '''
+ Get mask of duplicated values.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame({"a": [1, 1, 2]})
+ >>> df.select(pl.col("a").is_duplicated())
+ shape: (3, 1)
+ ┌───────┐
+ │ a │
+ │ --- │
+ │ bool │
+ ╞═══════╡
+ │ true │
+ │ true │
+ │ false │
+ └───────┘
+
+ '''
+ def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self:
+ '''
+ Get quantile value.
+
+ Parameters
+ ----------
+ quantile
+ Quantile between 0.0 and 1.0.
+ interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'}
+ Interpolation method.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]})
+ >>> df.select(pl.col("a").quantile(0.3))
+ shape: (1, 1)
+ ┌─────┐
+ │ a │
+ │ --- │
+ │ f64 │
+ ╞═════╡
+ │ 1.0 │
+ └─────┘
+ >>> df.select(pl.col("a").quantile(0.3, interpolation="higher"))
+ shape: (1, 1)
+ ┌─────┐
+ │ a │
+ │ --- │
+ │ f64 │
+ ╞═════╡
+ │ 2.0 │
+ └─────┘
+ >>> df.select(pl.col("a").quantile(0.3, interpolation="lower"))
+ shape: (1, 1)
+ ┌─────┐
+ │ a │
+ │ --- │
+ │ f64 │
+ ╞═════╡
+ │ 1.0 │
+ └─────┘
+ >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint"))
+ shape: (1, 1)
+ ┌─────┐
+ │ a │
+ │ --- │
+ │ f64 │
+ ╞═════╡
+ │ 1.5 │
+ └─────┘
+ >>> df.select(pl.col("a").quantile(0.3, interpolation="linear"))
+ shape: (1, 1)
+ ┌─────┐
+ │ a │
+ │ --- │
+ │ f64 │
+ ╞═════╡
+ │ 1.5 │
+ └─────┘
+
+ '''
+ def cut(self, breaks: list[float], labels: list[str] | None = ..., left_closed: bool = ...) -> Self:
+ '''
+ Bin continuous values into discrete categories.
+
+ Parameters
+ ----------
+ breaks
+ A list of unique cut points.
+ labels
+ Labels to assign to bins. If given, the length must be len(breaks) + 1.
+ left_closed + Whether intervals should be [) instead of the default of (] + + Examples + -------- + >>> g = pl.repeat("a", 5, eager=True).append(pl.repeat("b", 5, eager=True)) + >>> df = pl.DataFrame(dict(g=g, x=range(10))) + >>> df.with_columns(q=pl.col("x").cut([2, 5])) + shape: (10, 3) + ┌─────┬─────┬───────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═══════════╡ + │ a ┆ 0 ┆ (-inf, 2] │ + │ a ┆ 1 ┆ (-inf, 2] │ + │ a ┆ 2 ┆ (-inf, 2] │ + │ a ┆ 3 ┆ (2, 5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (5, inf] │ + │ b ┆ 7 ┆ (5, inf] │ + │ b ┆ 8 ┆ (5, inf] │ + │ b ┆ 9 ┆ (5, inf] │ + └─────┴─────┴───────────┘ + >>> df.with_columns(q=pl.col("x").cut([2, 5], left_closed=True)) + shape: (10, 3) + ┌─────┬─────┬───────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═══════════╡ + │ a ┆ 0 ┆ [-inf, 2) │ + │ a ┆ 1 ┆ [-inf, 2) │ + │ a ┆ 2 ┆ [2, 5) │ + │ a ┆ 3 ┆ [2, 5) │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ [5, inf) │ + │ b ┆ 7 ┆ [5, inf) │ + │ b ┆ 8 ┆ [5, inf) │ + │ b ┆ 9 ┆ [5, inf) │ + └─────┴─────┴───────────┘ + ''' + def qcut(self, probs: list[float], labels: list[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ...) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + probs + Probabilities for which to find the corresponding quantiles + For p in probs, we assume 0 <= p <= 1 + labels + Labels to assign to bins. If given, the length must be len(probs) + 1. + If computing over groups this must be set for now. + left_closed + Whether intervals should be [) instead of the default of (] + allow_duplicates + If True, the resulting quantile breaks don\'t have to be unique. This can + happen even with unique probs depending on the data. Duplicates will be + dropped, resulting in fewer bins. + + + Examples + -------- + >>> g = pl.repeat("a", 5, eager=True).append(pl.repeat("b", 5, eager=True)) + >>> df = pl.DataFrame(dict(g=g, x=range(10))) + >>> df.with_columns(q=pl.col("x").qcut([0.5])) + shape: (10, 3) + ┌─────┬─────┬─────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════════════╡ + │ a ┆ 0 ┆ (-inf, 4.5] │ + │ a ┆ 1 ┆ (-inf, 4.5] │ + │ a ┆ 2 ┆ (-inf, 4.5] │ + │ a ┆ 3 ┆ (-inf, 4.5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (4.5, inf] │ + │ b ┆ 7 ┆ (4.5, inf] │ + │ b ┆ 8 ┆ (4.5, inf] │ + │ b ┆ 9 ┆ (4.5, inf] │ + └─────┴─────┴─────────────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.5], ["lo", "hi"]).over("g")) + shape: (10, 3) + ┌─────┬─────┬─────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ lo │ + │ a ┆ 1 ┆ lo │ + │ a ┆ 2 ┆ lo │ + │ a ┆ 3 ┆ hi │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ lo │ + │ b ┆ 7 ┆ lo │ + │ b ┆ 8 ┆ hi │ + │ b ┆ 9 ┆ hi │ + └─────┴─────┴─────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.5], ["lo", "hi"], True).over("g")) + shape: (10, 3) + ┌─────┬─────┬─────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ lo │ + │ a ┆ 1 ┆ lo │ + │ a ┆ 2 ┆ hi │ + │ a ┆ 3 ┆ hi │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ lo │ + │ b ┆ 7 ┆ hi │ + │ b ┆ 8 ┆ hi │ + │ b ┆ 9 ┆ hi │ + └─────┴─────┴─────┘ + + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... 
} + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. 
This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... 
) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... 
pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq_missing(pl.col("y")).alias("x == y"), + ... ) + shape: (6, 3) + ┌──────┬──────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞══════╪══════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + │ null ┆ 5.0 ┆ false │ + │ null ┆ null ┆ true │ + └──────┴──────┴────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... 
"x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne_missing(pl.col("y")).alias("x != y"), + ... ) + shape: (6, 3) + ┌──────┬──────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞══════╪══════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + │ null ┆ 5.0 ┆ true │ + │ null ┆ null ┆ false │ + └──────┴──────┴────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... 
) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... 
) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of logical exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) 
-> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... 
pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'linear\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, *args, **kwargs) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, *args, **kwargs) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, *args, **kwargs) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, *args, **kwargs) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. 
+ + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, *args, **kwargs) -> Self: + ''' + Compute a rolling standard deviation. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... 
{"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, *args, **kwargs) -> Self: + ''' + Compute a rolling variance. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). 
Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, *args, **kwargs) -> Self: + ''' + Compute a rolling median. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, *args, **kwargs) -> Self: + ''' + Compute a rolling quantile. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. 
+ + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f32 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. 
math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. 
If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ..., fixed_seed: bool = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + fixed_seed + If True, The seed will not be incremented between draws. + This can make output predictable because draw ordering can + change due to threads being scheduled in a different order. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + fixed_seed + If True, The seed will not be incremented between draws. + This can make output predictable because draw ordering can + change due to threads being scheduled in a different order. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. 
+ sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... 
) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + This can actually hurt performance and can have a lot of contention. + It is advised not to use it until actually benchmarked on your problem. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/lazyframe/frame deleted file mode 100644 index c4e7e0e..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/lazyframe/frame +++ /dev/null @@ -1,128 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath -from typing import Any, Callable, ClassVar, Collection, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... 
- @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) 
-> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: IntoExpr) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... 
- def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..c4b2054 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/lazyframe/frame.pyi @@ -0,0 +1,3417 @@ +#: version 0.18.6 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. 
+ + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. 
+ streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. 
+ simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. 
+ row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... 
) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: IntoExpr) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. 
Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. 
+ + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). 
+ + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.groupby_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_rolling + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... 
).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.arange(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. 
+ strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. 
note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. 
+ + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... 
).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. 
+ strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. 
+ + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... 
) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. 
+ + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... 
).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/series/series deleted file mode 100644 index 5c7cef7..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/series/series +++ /dev/null @@ -1,340 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type 
-from polars.utils.various import _is_generator as _is_generator, find_stacklevel as find_stacklevel, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... 
- @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - @overload - def __floordiv__(self, other: Expr) -> Expr: ... - @overload - def __floordiv__(self, other: Any) -> Series: ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... 
- def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, maintain_order: bool = ..., series: bool = ..., left_closed: bool = ...) -> DataFrame | Series: ... - def qcut(self, quantiles: list[float], *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., maintain_order: bool = ..., series: bool = ..., left_closed: bool = ..., allow_duplicates: bool = ...) -> DataFrame | Series: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool | None = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) 
-> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... 
- def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... 
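The replacement `series.pyi` stub that follows begins with a `#: version 0.18.6` comment, which is how the generator records which polars release a stub was built against. A minimal sketch, using only the standard library, of reading that header back from a stub file — the helper name is illustrative, not part of the package:

    from pathlib import Path

    def read_stub_version(stub_path: Path) -> str:
        # Generated stubs begin with a line of the form "#: version <x.y.z>".
        first_line = stub_path.read_text().splitlines()[0]
        return first_line.removeprefix("#: version").strip()

    # e.g. read_stub_version(
    #     Path("src/polugins_type_gen/_stubs/0.18.6/polars/series/series.pyi")
    # ) -> "0.18.6"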
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/series/series.pyi new file mode 100644 index 0000000..4eb9169 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.6/polars/series/series.pyi @@ -0,0 +1,4179 @@ +#: version 0.18.6 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, find_stacklevel as find_stacklevel, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... 
+ @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... 
+ def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) 
-> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmin` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ...) -> DataFrame | Series: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. Only used if series == False + category_label + Name given to the category column. Only used if series == False + maintain_order + Keep the order of the original `Series`. 
Only used if series == False + series + If True, return the a categorical series in the data\'s original order + left_closed + Whether intervals should be [) instead of (] + + Returns + ------- + DataFrame or Series + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1]) + shape: (12, 3) + ┌──────┬─────────────┬──────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪══════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │ + │ 1.5 ┆ inf ┆ (1.0, inf] │ + │ 2.0 ┆ inf ┆ (1.0, inf] │ + │ 2.5 ┆ inf ┆ (1.0, inf] │ + └──────┴─────────────┴──────────────┘ + >>> a.cut([-1, 1], series=True) + shape: (12,) + Series: \'a\' [cat] + [ + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-1, 1]" + "(-1, 1]" + "(-1, 1]" + "(-1, 1]" + "(1, inf]" + "(1, inf]" + "(1, inf]" + ] + >>> a.cut([-1, 1], series=True, left_closed=True) + shape: (12,) + Series: \'a\' [cat] + [ + "[-inf, -1)" + "[-inf, -1)" + "[-inf, -1)" + "[-inf, -1)" + "[-1, 1)" + "[-1, 1)" + "[-1, 1)" + "[-1, 1)" + "[1, inf)" + "[1, inf)" + "[1, inf)" + "[1, inf)" + ] + ''' + def qcut(self, quantiles: list[float]) -> DataFrame | Series: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + List of quantiles to create. + We expect quantiles ``0.0 <= quantile <= 1`` + labels + Labels to assign to the quantiles. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column. Only used if series == False. + category_label + Name given to the category column. Only used if series == False. + maintain_order + Keep the order of the original `Series`. Only used if series == False. + series + If True, return a categorical series in the data\'s original order + left_closed + Whether intervals should be [) instead of (] + allow_duplicates + If True, the resulting quantile breaks don\'t have to be unique. This can + happen even with unique probs depending on the data. Duplicates will be + dropped, resulting in fewer bins. + + Returns + ------- + DataFrame or Series + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75]) + shape: (8, 3) + ┌──────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪═══════════════╡ + │ -5.0 ┆ -5.0 ┆ (-inf, -5.0] │ + │ -4.0 ┆ -3.25 ┆ (-5.0, -3.25] │ + │ -3.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0.0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1.0 ┆ inf ┆ (0.25, inf] │ + │ 2.0 ┆ inf ┆ (0.25, inf] │ + └──────┴─────────────┴───────────────┘ + >>> a.qcut([0.0, 0.25, 0.75], series=True) + shape: (8,) + Series: \'a\' [cat] + [ + "(-inf, -5]" + "(-5, -3.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(0.25, inf]" + "(0.25, inf]" + ] + >>> a.qcut([0.0, 0.25, 0.75], series=True, left_closed=True) + shape: (8,) + Series: \'a\' [cat] + [ + "[-5, -3.25)" + "[-5, -3.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[0.25, inf)" + "[0.25, inf)" + ] + ''' + def hist(self, bins: list[float] | None = ...) 
-> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). 
+ + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. 
+ + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. 
+ + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. 
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. 
+ + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. 
+ + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. 
+ + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) 
-> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. 
+ + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
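The Series stub above closes with several methods (`ewm_mean`, `clip_min`, `clip_max`, `kurtosis`, `shrink_dtype`) whose docstrings, unlike most of their neighbours, ship without doctest examples. A minimal usage sketch against the public polars API — assuming a polars ~=0.18 install; the series values are made up for illustration and no outputs are claimed:

import polars as pl

s = pl.Series("a", [1, 2, 3, 40])

# Exponentially-weighted mean; com=1 corresponds to alpha = 1 / (1 + com) = 0.5,
# matching the formula given in the ewm_* docstrings above.
smoothed = s.ewm_mean(com=1)

# One-sided clipping, as documented for clip_min / clip_max.
floored = s.clip_min(2)
capped = s.clip_max(10)

# Scalar statistic typed as float | None.
k = s.kurtosis()

# Downcast to the smallest numeric dtype that fits the data (Int8 here).
compact = s.shrink_dtype()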
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/dataframe/frame deleted file mode 100644 index 76f7fd4..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/dataframe/frame +++ /dev/null @@ -1,280 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase, TextIOWrapper -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Null as Null, Object as Object, Struct as Struct, Time as Time, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SelectorType as SelectorType, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as 
UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... 
- @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... 
- def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) 
-> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, df: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: Self) -> Self: ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... 
- def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... 
- def rows_by_key(self, key: str | Sequence[str] | SelectorType, *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs: Any) -> DataFrame: ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/dataframe/frame.pyi new file mode 100644 index 0000000..2435fc5 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/dataframe/frame.pyi @@ -0,0 +1,5983 @@ +#: version 0.18.7 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Null as Null, Object as Object, Struct as Struct, Time as Time, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, 
iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. 
+ * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. 
+ + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. 
+ + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. 
+ + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your
+ use-case you should export to a different format (such as Arrow or NumPy).
+ 
+ Examples
+ --------
+ >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
+ >>> df.to_dicts()
+ [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}]
+ 
+ '''
+ def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]:
+ '''
+ Convert DataFrame to a 2D NumPy array.
+ 
+ This operation clones data.
+ 
+ Parameters
+ ----------
+ structured
+ Optionally return a structured array, with field names and
+ dtypes that correspond to the DataFrame schema.
+ order
+ The index order of the returned NumPy array, either C-like or
+ Fortran-like. In general, using the Fortran-like index order is faster.
+ However, the C-like order might be more appropriate to use for downstream
+ applications to prevent cloning data, e.g. when reshaping into a
+ one-dimensional array. Note that this option only takes effect if
+ ``structured`` is set to ``False`` and the DataFrame dtypes allow for a
+ global dtype for all columns.
+ 
+ Notes
+ -----
+ If you\'re attempting to convert Utf8 to an array you\'ll need to install
+ ``pyarrow``.
+ 
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "foo": [1, 2, 3],
+ ... "bar": [6.5, 7.0, 8.5],
+ ... "ham": ["a", "b", "c"],
+ ... },
+ ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32},
+ ... )
+ 
+ Export to a standard 2D numpy array.
+ 
+ >>> df.to_numpy()
+ array([[1, 6.5, \'a\'],
+ [2, 7.0, \'b\'],
+ [3, 8.5, \'c\']], dtype=object)
+ 
+ Export to a structured array, which can better-preserve individual
+ column data, such as name and dtype...
+ 
+ >>> df.to_numpy(structured=True)
+ array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')],
+ dtype=[(\'foo\', \'u1\'), (\'bar\', \'<f4\'), (\'ham\', \'<U1\')])
+ 
+ ...optionally zero-copying as a record array view:
+ 
+ >>> import numpy as np
+ >>> df.to_numpy(True).view(np.recarray)
+ rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')],
+ dtype=[(\'foo\', \'u1\'), (\'bar\', \'<f4\'), (\'ham\', \'<U1\')])
+ 
+ '''
+ def to_pandas(self, *args: Any, **kwargs: Any) -> pd.DataFrame:
+ '''
+ Cast to a pandas DataFrame.
+ 
+ This requires that :mod:`pandas` and :mod:`pyarrow` are installed.
+ This operation clones data, unless `use_pyarrow_extension_array=True`.
+ 
+ Parameters
+ ----------
+ use_pyarrow_extension_array
+ Use PyArrow backed-extension arrays instead of numpy arrays for each column
+ of the pandas DataFrame; this allows zero copy operations and preservation
+ of null values. Subsequent operations on the resulting pandas DataFrame may
+ trigger conversion to NumPy arrays if that operation is not supported by
+ pyarrow compute functions.
+ kwargs
+ Arguments will be sent to :meth:`pyarrow.Table.to_pandas`.
+ 
+ Returns
+ -------
+ :class:`pandas.DataFrame`
+ 
+ Examples
+ --------
+ >>> import pandas
+ >>> df1 = pl.DataFrame(
+ ... {
+ ... "foo": [1, 2, 3],
+ ... "bar": [6, 7, 8],
+ ... "ham": ["a", "b", "c"],
+ ... }
+ ... )
+ >>> pandas_df1 = df1.to_pandas()
+ >>> type(pandas_df1)
+ <class \'pandas.core.frame.DataFrame\'>
+ >>> pandas_df1.dtypes
+ foo int64
+ bar int64
+ ham object
+ dtype: object
+ >>> df2 = pl.DataFrame(
+ ... {
+ ... "foo": [1, 2, None],
+ ... "bar": [6, None, 8],
+ ... "ham": [None, "b", "c"],
+ ... }
+ ... )
+ >>> pandas_df2 = df2.to_pandas()
+ >>> pandas_df2
+ foo bar ham
+ 0 1.0 6.0 None
+ 1 2.0 NaN b
+ 2 NaN 8.0 c
+ >>> pandas_df2.dtypes
+ foo float64
+ bar float64
+ ham object
+ dtype: object
+ >>> pandas_df2_pa = df2.to_pandas(
+ ... use_pyarrow_extension_array=True
+ ... 
) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. 
+ 
+ datetime_format
+ A format string, with the specifiers defined by the
+ `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+ Rust crate. If no format specified, the default fractional-second
+ precision is inferred from the maximum timeunit found in the frame\'s
+ Datetime cols (if any).
+ date_format
+ A format string, with the specifiers defined by the
+ `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+ Rust crate.
+ time_format
+ A format string, with the specifiers defined by the
+ `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+ Rust crate.
+ float_precision
+ Number of decimal places to write, applied to both ``Float32`` and
+ ``Float64`` datatypes.
+ null_value
+ A string representing null values (defaulting to the empty string).
+ 
+ Examples
+ --------
+ >>> import pathlib
+ >>>
+ >>> df = pl.DataFrame(
+ ... {
+ ... "foo": [1, 2, 3, 4, 5],
+ ... "bar": [6, 7, 8, 9, 10],
+ ... "ham": ["a", "b", "c", "d", "e"],
+ ... }
+ ... )
+ >>> path: pathlib.Path = dirpath / "new_file.csv"
+ >>> df.write_csv(path, separator=",")
+ 
+ '''
+ def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None:
+ '''
+ Write to Apache Avro file.
+ 
+ Parameters
+ ----------
+ file
+ File path to which the file should be written.
+ compression : {\'uncompressed\', \'snappy\', \'deflate\'}
+ Compression method. Defaults to "uncompressed".
+ 
+ Examples
+ --------
+ >>> import pathlib
+ >>>
+ >>> df = pl.DataFrame(
+ ... {
+ ... "foo": [1, 2, 3, 4, 5],
+ ... "bar": [6, 7, 8, 9, 10],
+ ... "ham": ["a", "b", "c", "d", "e"],
+ ... }
+ ... )
+ >>> path: pathlib.Path = dirpath / "new_file.avro"
+ >>> df.write_avro(path)
+ 
+ '''
+ def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook:
+ '''
+ Write frame data to a table in an Excel workbook/worksheet.
+ 
+ Parameters
+ ----------
+ workbook : Workbook
+ String name or path of the workbook to create, BytesIO object to write
+ into, or an open ``xlsxwriter.Workbook`` object that has not been closed.
+ If None, writes to a ``dataframe.xlsx`` workbook in the working directory.
+ worksheet : str
+ Name of target worksheet; if None, writes to "Sheet1" when creating a new
+ workbook (note that writing to an existing workbook requires a valid
+ existing -or new- worksheet name).
+ position : {str, tuple}
+ Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple.
+ table_style : {str, dict}
+ A named Excel table style, such as "Table Style Medium 4", or a dictionary
+ of ``{"key":value,}`` options containing one or more of the following keys:
+ "style", "first_column", "last_column", "banded_columns", "banded_rows".
+ table_name : str
+ Name of the output table object in the worksheet; can then be referred to
+ in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations.
+ column_formats : dict
+ A ``{colname:str,}`` dictionary for applying an Excel format string to the
+ given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc)
+ will override any defined in ``dtype_formats`` (below).
+ dtype_formats : dict
+ A ``{dtype:str,}`` dictionary that sets the default Excel format for the
+ given dtype. (This can be overridden on a per-column basis by the
+ ``column_formats`` param). It is also valid to use dtype groups such as
+ ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform
+ integer and float formats.
+ conditional_formats : dict
+ A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}``
+ dictionary defining conditional format options for the specified columns.
+ + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. 
+ + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... 
dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... 
"id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection_uri + Connection URI, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. 
+ engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Note: Some polars data types like `Null`, `Categorical` and `Time` are + not supported by the delta protocol specification. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Examples + -------- + Instantiate a basic dataframe: + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + + Write DataFrame as a Delta Lake table on local filesystem. + + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on local filesystem. + Note: This will fail if schema of the new data does not match the + schema of existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + Note: If the schema of the new and old data is same, + then setting `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table on cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable that yields column names. Will be used to + replace the columns in the DataFrame. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... 
) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... 
) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... 
) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. 
+ + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... 
"ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... 
+ >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... 
print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... 
"2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... 
) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. 
+ every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). 
Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + It is better to implement this with an expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a Series by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, df: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + df + DataFrame to stack. + in_place + Modify in place + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: Self) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of this + `DataFrame` `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. + For instance during online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a + query. For instance when you read in multiple files and when to store them in a + single `DataFrame`. In the latter case, finish the sequence of `vstack` + operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. 
+ + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... 
"b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot( + ... values="baz", index="foo", columns="bar", aggregate_function="first" + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.int_range(0, 9, eager=True), + ... } + ... 
) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... 
) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. 
+ + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using ``iter_rows`` instead to avoid + materialising all the data at once. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + ''' + def rows_by_key(self, key: str | Sequence[str] | SelectorType) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. 
+ + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. 
+ + Returns + ------- + An iterator of tuples (default) or dictionaries (if named) of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... 
) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy ``corrcoef``. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. 
+ + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/expr/expr deleted file mode 100644 index 4491b04..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/expr/expr +++ /dev/null @@ -1,263 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... 
- def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self) -> Self: ... - def all(self) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def keep_name(self) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... 
- def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: list[float], labels: list[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: ... - def qcut(self, probs: list[float], labels: list[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> Self: ... - def rle(self) -> Self: ... - def rle_id(self) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... 
- def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... 
- def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ..., fixed_seed: bool = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ..., fixed_seed: bool = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/expr/expr.pyi new file mode 100644 index 0000000..0d4ec8c --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/expr/expr.pyi @@ -0,0 +1,7583 @@ +#: version 0.18.7 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... 
+ def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def all(self) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the output of an expression. + + Parameters + ---------- + name + New name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + >>> df.select( + ... pl.col("a").alias("bar"), + ... pl.col("b").alias("foo"), + ... ) + shape: (3, 2) + ┌─────┬──────┐ + │ bar ┆ foo │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪══════╡ + │ 1 ┆ a │ + │ 2 ┆ b │ + │ 3 ┆ null │ + └─────┴──────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... 
) + + Keep original column name to undo an alias operation. + + >>> df.with_columns([(pl.col("a") * 9).alias("c").keep_name()]) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent + "DuplicateError: Column with name: \'literal\' has more than one occurrences" + errors. + + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to root column name. + + See Also + -------- + alias + map_alias + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().prefix("reverse_"), + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ reverse_A ┆ reverse_fruits ┆ reverse_B ┆ reverse_cars │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to root column name. + + See Also + -------- + alias + map_alias + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df + shape: (5, 4) + ┌─────┬────────┬─────┬────────┐ + │ A ┆ fruits ┆ B ┆ cars │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi │ + │ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 5 ┆ banana ┆ 1 ┆ beetle │ + └─────┴────────┴─────┴────────┘ + >>> df.select( + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps root name to new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2], + ... "B": [3, 4], + ... } + ... ) + + >>> df.select(pl.all().reverse().suffix("_reverse")).with_columns( + ... pl.all().map_alias( + ... # Remove "_reverse" suffix and convert to lower case. + ... lambda col_name: col_name.rsplit("_reverse", 1)[0].lower() + ... ) + ... ) + shape: (2, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 2 ┆ 4 ┆ 2 ┆ 4 │ + │ 1 ┆ 3 ┆ 1 ┆ 3 │ + └───────────┴───────────┴─────┴─────┘ + + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... 
) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... 
pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) 
-> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. 
+ + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. 
+ + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. 
+ + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: list[float], labels: list[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + A list of unique cut points. + labels + Labels to assign to bins. If given, the length must be len(probs) + 1. + left_closed + Whether intervals should be [) instead of the default of (] + include_breaks + Include the the right endpoint of the bin each observation falls in. 
+ If True, the resulting column will be a Struct. + + Examples + -------- + >>> g = pl.repeat("a", 5, eager=True).append(pl.repeat("b", 5, eager=True)) + >>> df = pl.DataFrame(dict(g=g, x=range(10))) + >>> df.with_columns(q=pl.col("x").cut([2, 5])) + shape: (10, 3) + ┌─────┬─────┬───────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═══════════╡ + │ a ┆ 0 ┆ (-inf, 2] │ + │ a ┆ 1 ┆ (-inf, 2] │ + │ a ┆ 2 ┆ (-inf, 2] │ + │ a ┆ 3 ┆ (2, 5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (5, inf] │ + │ b ┆ 7 ┆ (5, inf] │ + │ b ┆ 8 ┆ (5, inf] │ + │ b ┆ 9 ┆ (5, inf] │ + └─────┴─────┴───────────┘ + >>> df.with_columns(q=pl.col("x").cut([2, 5], left_closed=True)) + shape: (10, 3) + ┌─────┬─────┬───────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═══════════╡ + │ a ┆ 0 ┆ [-inf, 2) │ + │ a ┆ 1 ┆ [-inf, 2) │ + │ a ┆ 2 ┆ [2, 5) │ + │ a ┆ 3 ┆ [2, 5) │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ [5, inf) │ + │ b ┆ 7 ┆ [5, inf) │ + │ b ┆ 8 ┆ [5, inf) │ + │ b ┆ 9 ┆ [5, inf) │ + └─────┴─────┴───────────┘ + ''' + def qcut(self, probs: list[float], labels: list[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + probs + Probabilities for which to find the corresponding quantiles + For p in probs, we assume 0 <= p <= 1 + labels + Labels to assign to bins. If given, the length must be len(probs) + 1. + If computing over groups this must be set for now. + left_closed + Whether intervals should be [) instead of the default of (] + allow_duplicates + If True, the resulting quantile breaks don\'t have to be unique. This can + happen even with unique probs depending on the data. Duplicates will be + dropped, resulting in fewer bins. + include_breaks + Include the the right endpoint of the bin each observation falls in. + If True, the resulting column will be a Struct. 
+ + + Examples + -------- + >>> g = pl.repeat("a", 5, eager=True).append(pl.repeat("b", 5, eager=True)) + >>> df = pl.DataFrame(dict(g=g, x=range(10))) + >>> df.with_columns(q=pl.col("x").qcut([0.5])) + shape: (10, 3) + ┌─────┬─────┬─────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════════════╡ + │ a ┆ 0 ┆ (-inf, 4.5] │ + │ a ┆ 1 ┆ (-inf, 4.5] │ + │ a ┆ 2 ┆ (-inf, 4.5] │ + │ a ┆ 3 ┆ (-inf, 4.5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (4.5, inf] │ + │ b ┆ 7 ┆ (4.5, inf] │ + │ b ┆ 8 ┆ (4.5, inf] │ + │ b ┆ 9 ┆ (4.5, inf] │ + └─────┴─────┴─────────────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.5], ["lo", "hi"]).over("g")) + shape: (10, 3) + ┌─────┬─────┬─────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ lo │ + │ a ┆ 1 ┆ lo │ + │ a ┆ 2 ┆ lo │ + │ a ┆ 3 ┆ hi │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ lo │ + │ b ┆ 7 ┆ lo │ + │ b ┆ 8 ┆ hi │ + │ b ┆ 9 ┆ hi │ + └─────┴─────┴─────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.5], ["lo", "hi"], True).over("g")) + shape: (10, 3) + ┌─────┬─────┬─────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ lo │ + │ a ┆ 1 ┆ lo │ + │ a ┆ 2 ┆ hi │ + │ a ┆ 3 ┆ hi │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ lo │ + │ b ┆ 7 ┆ hi │ + │ b ┆ 8 ┆ hi │ + │ b ┆ 9 ┆ hi │ + └─────┴─────┴─────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.25, 0.5], include_breaks=True)) + shape: (10, 3) + ┌─────┬─────┬───────────────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ struct[2] │ + ╞═════╪═════╪═══════════════════════╡ + │ a ┆ 0 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 1 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 2 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 3 ┆ {4.5,"(2.25, 4.5]"} │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 7 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 8 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 9 ┆ {inf,"(4.5, inf]"} │ + └─────┴─────┴───────────────────────┘ + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + A Struct Series containing "lengths" and "values" Fields + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... 
{ + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬──────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪══════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ null ┆ 3 │ + └───────────┴──────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. 
+ - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... 
) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of logical "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more logical boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... 
pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq_missing(pl.col("y")).alias("x == y"), + ... ) + shape: (6, 3) + ┌──────┬──────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞══════╪══════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + │ null ┆ 5.0 ┆ false │ + │ null ┆ null ┆ true │ + └──────┴──────┴────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... 
"x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne_missing(pl.col("y")).alias("x != y"), + ... ) + shape: (6, 3) + ┌──────┬──────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞══════╪══════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + │ null ┆ 5.0 ┆ true │ + │ null ┆ null ┆ false │ + └──────┴──────┴────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... 
) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... 
) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of logical exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) 
-> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... 
pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill nulls with linear interpolation over missing values. + + Can also be used to regrid data to a new grid - see examples below. + + Parameters + ---------- + method : {\'linear\', \'linear\'} + Interpolation method + + Examples + -------- + >>> # Fill nulls with linear interpolation + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, *args, **kwargs) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, *args, **kwargs) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, *args, **kwargs) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, *args, **kwargs) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. 
+
+        If you pass a ``by`` column ``<t_0, t_1, ..., t_n>``, then `closed="left"`
+        means the windows will be:
+
+            - [t_0 - window_size, t_0)
+            - [t_1 - window_size, t_1)
+            - ...
+            - [t_n - window_size, t_n)
+
+        With `closed="right"`, the left endpoint is not included and the right
+        endpoint is included.
+
+        Parameters
+        ----------
+        window_size
+            The length of the window. Can be a fixed integer size, or a dynamic temporal
+            size indicated by a timedelta or the following string language:
+
+            - 1ns   (1 nanosecond)
+            - 1us   (1 microsecond)
+            - 1ms   (1 millisecond)
+            - 1s    (1 second)
+            - 1m    (1 minute)
+            - 1h    (1 hour)
+            - 1d    (1 calendar day)
+            - 1w    (1 calendar week)
+            - 1mo   (1 calendar month)
+            - 1q    (1 calendar quarter)
+            - 1y    (1 calendar year)
+            - 1i    (1 index count)
+
+            Suffix with `"_saturating"` to indicate that dates too large for
+            their month should saturate at the largest date
+            (e.g. 2022-02-29 -> 2022-02-28) instead of erroring.
+
+            By "calendar day", we mean the corresponding time on the next day
+            (which may not be 24 hours, due to daylight savings). Similarly for
+            "calendar week", "calendar month", "calendar quarter", and
+            "calendar year".
+
+            If a timedelta or the dynamic string language is used, the `by`
+            and `closed` arguments must also be set.
+        weights
+            An optional slice with the same length as the window that will be multiplied
+            elementwise with the values in the window.
+        min_periods
+            The number of values in the window that should be non-null before computing
+            a result. If None, it will be set equal to window size.
+        center
+            Set the labels at the center of the window
+        by
+            If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must
+            set the column that will be used to determine the windows. This column must
+            be of dtype `{Date, Datetime}`
+        closed : {\'left\', \'right\', \'both\', \'none\'}
+            Define which sides of the temporal interval are closed (inclusive); only
+            applicable if `by` has been set.
+
+        Warnings
+        --------
+        This functionality is experimental and may change without it being considered a
+        breaking change.
+
+        Notes
+        -----
+        If you want to compute multiple aggregation statistics over the same dynamic
+        window, consider using `groupby_rolling` this method can cache the window size
+        computation.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]})
+        >>> df.with_columns(
+        ...     rolling_sum=pl.col("A").rolling_sum(window_size=2),
+        ... )
+        shape: (6, 2)
+        ┌─────┬─────────────┐
+        │ A   ┆ rolling_sum │
+        │ --- ┆ ---         │
+        │ f64 ┆ f64         │
+        ╞═════╪═════════════╡
+        │ 1.0 ┆ null        │
+        │ 2.0 ┆ 3.0         │
+        │ 3.0 ┆ 5.0         │
+        │ 4.0 ┆ 7.0         │
+        │ 5.0 ┆ 9.0         │
+        │ 6.0 ┆ 11.0        │
+        └─────┴─────────────┘
+
+        Specify weights to multiply the values in the window with:
+
+        >>> df.with_columns(
+        ...     rolling_sum=pl.col("A").rolling_sum(
+        ...         window_size=2, weights=[0.25, 0.75]
+        ...     ),
+        ... )
+        shape: (6, 2)
+        ┌─────┬─────────────┐
+        │ A   ┆ rolling_sum │
+        │ --- ┆ ---         │
+        │ f64 ┆ f64         │
+        ╞═════╪═════════════╡
+        │ 1.0 ┆ null        │
+        │ 2.0 ┆ 1.75        │
+        │ 3.0 ┆ 2.75        │
+        │ 4.0 ┆ 3.75        │
+        │ 5.0 ┆ 4.75        │
+        │ 6.0 ┆ 5.75        │
+        └─────┴─────────────┘
+
+        Center the values in the window
+
+        >>> df.with_columns(
+        ...     rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True),
+        ...
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, *args, **kwargs) -> Self: + ''' + Compute a rolling standard deviation. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... 
{"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, *args, **kwargs) -> Self: + ''' + Compute a rolling variance. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). 
Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, *args, **kwargs) -> Self: + ''' + Compute a rolling median. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, *args, **kwargs) -> Self: + ''' + Compute a rolling quantile. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. 
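# --- Editorial aside (not part of the generated stub or the patch) ----------
# The `rank` docstring above lists six tie-breaking methods, but its doctests
# only exercise 'average' and 'ordinal'.  A minimal pure-Python sketch of the
# 'min' ("competition") and 'dense' rules, applied to the same data
# [3, 6, 1, 1, 6], may help readers check their intuition.  This is a
# hand-written reference, not Polars code or generated output.

def rank_min(values):
    # rank of v = 1 + number of elements strictly smaller than v
    return [1 + sum(w < v for w in values) for v in values]

def rank_dense(values):
    # rank of v = 1 + position of v among the sorted distinct values
    distinct = sorted(set(values))
    return [1 + distinct.index(v) for v in values]

assert rank_min([3, 6, 1, 1, 6]) == [3, 4, 1, 1, 4]
assert rank_dense([3, 6, 1, 1, 6]) == [2, 3, 1, 1, 3]
# ----------------------------------------------------------------------------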
+ + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f32 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. 
math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. 
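# --- Editorial aside (not part of the generated stub or the patch) ----------
# `clip`, `clip_min` and `clip_max` above all point to a "when, then, otherwise"
# expression as the fallback for non-numeric dtypes.  A hedged sketch of what
# that fallback looks like for the same clip(1, 10) example; this is an
# illustration of the suggested pattern, not the implementation Polars uses
# internally, and the output dtype may differ slightly from `clip`.
import polars as pl

df = pl.DataFrame({"foo": [-50, 5, None, 50]})
clipped = df.with_columns(
    pl.when(pl.col("foo") < 1)
    .then(1)
    .when(pl.col("foo") > 10)
    .then(10)
    .otherwise(pl.col("foo"))
    .alias("foo_clipped")
)
# Nulls fail both comparisons and fall through to `otherwise`, so they stay
# null, matching the behaviour shown in the clip() doctest above.
# ----------------------------------------------------------------------------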
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. 
If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ..., fixed_seed: bool = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + fixed_seed + If True, The seed will not be incremented between draws. + This can make output predictable because draw ordering can + change due to threads being scheduled in a different order. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + fixed_seed + If True, The seed will not be incremented between draws. + This can make output predictable because draw ordering can + change due to threads being scheduled in a different order. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
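# --- Editorial aside (not part of the generated stub or the patch) ----------
# The ewm_mean / ewm_std / ewm_var docstrings above all accept `com`, `span`,
# `half_life` or `alpha` and quote the formulas that reduce the first three to
# a single smoothing factor alpha.  A small worked sketch of those formulas,
# as an editor's illustration only (Polars' own normalisation lives in the
# private `_prepare_alpha` helper declared at the end of this stub):
import math

def decay_to_alpha(com=None, span=None, half_life=None, alpha=None) -> float:
    if sum(v is not None for v in (com, span, half_life, alpha)) != 1:
        raise ValueError("specify exactly one of com, span, half_life, alpha")
    if com is not None:        # alpha = 1 / (1 + com), for com >= 0
        return 1.0 / (1.0 + com)
    if span is not None:       # alpha = 2 / (span + 1), for span >= 1
        return 2.0 / (span + 1.0)
    if half_life is not None:  # alpha = 1 - exp(-ln(2) / half_life), half_life > 0
        return 1.0 - math.exp(-math.log(2.0) / half_life)
    return float(alpha)        # already a smoothing factor, 0 < alpha <= 1

# e.g. decay_to_alpha(com=1) == 0.5, the alpha behind the ewm_mean(com=1) doctest.
# ----------------------------------------------------------------------------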
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. 
+ sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... 
) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + This can actually hurt performance and can have a lot of contention. + It is advised not to use it until actually benchmarked on your problem. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/lazyframe/frame deleted file mode 100644 index 3ce2762..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/lazyframe/frame +++ /dev/null @@ -1,128 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath -from typing import Any, Callable, ClassVar, Collection, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... 
- @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... 
- def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: IntoExpr) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... 
- def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..a7e5a3a --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/lazyframe/frame.pyi @@ -0,0 +1,3429 @@ +#: version 0.18.7 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. 
+ + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. 
+ streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. 
+ + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. 
+ Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. 
+ slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: IntoExpr) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. 
+ + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.groupby_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). 
Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_rolling + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... 
} + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... 
).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. 
+ + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. 
+ how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. 
+ + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. 
+ + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. 
+ strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. 
+ + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... 
) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. 
+ + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... 
).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
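As an illustrative sanity check (not part of this diff), a regenerated stub such as the 0.18.7 Series stub added in the next hunk can be verified to still be syntactically valid Python using only the standard library; the path below is the new file from that hunk, relative to the polugins_type_gen/ directory:

import ast
from pathlib import Path

# Parse the regenerated Series stub; ast.parse raises SyntaxError if the
# generated .pyi file is malformed.
stub_path = Path("src", "polugins_type_gen", "_stubs", "0.18.7", "polars", "series", "series.pyi")
ast.parse(stub_path.read_text(), filename=str(stub_path))
print(f"{stub_path} parses cleanly")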
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/series/series deleted file mode 100644 index 5231dab..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/series/series +++ /dev/null @@ -1,343 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type 
-from polars.utils.various import _is_generator as _is_generator, find_stacklevel as find_stacklevel, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... 
- @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - @overload - def __floordiv__(self, other: Expr) -> Expr: ... - @overload - def __floordiv__(self, other: Any) -> Series: ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def any(self) -> bool: ... - def all(self) -> bool: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... 
- def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, series: bool = ..., left_closed: bool = ..., include_breaks: bool = ...) -> DataFrame | Series: ... - def qcut(self, quantiles: list[float], *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., series: bool = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> DataFrame | Series: ... - def rle(self) -> Series: ... - def rle_id(self) -> Series: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool | None = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool = ...) -> Series: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... 
- def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... 
- def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... 
- -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/series/series.pyi new file mode 100644 index 0000000..8ac94cc --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.7/polars/series/series.pyi @@ -0,0 +1,4246 @@ +#: version 0.18.7 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, find_stacklevel as find_stacklevel, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... 
+ def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... 
+ def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def any(self) -> bool: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self) -> bool: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) 
-> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, bins: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ...) -> DataFrame | Series: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + bins + Bins to create. + labels + Labels to assign to the bins. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column/field. Only used if series == False or + include_breaks == True + category_label + Name given to the category column. Only used if series == False + maintain_order + Keep the order of the original `Series`. Only used if series == False + series + If True, return the a categorical series in the data\'s original order. + left_closed + Whether intervals should be [) instead of (] + include_breaks + Include the the right endpoint of the bin each observation falls in. 
+ If returning a DataFrame, it will be a column, and if returning a Series + it will be a field in a Struct + + Returns + ------- + DataFrame or Series + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1], series=False) + shape: (12, 3) + ┌──────┬─────────────┬────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1, 1] │ + │ 1.5 ┆ inf ┆ (1, inf] │ + │ 2.0 ┆ inf ┆ (1, inf] │ + │ 2.5 ┆ inf ┆ (1, inf] │ + └──────┴─────────────┴────────────┘ + >>> a.cut([-1, 1], series=True) + shape: (12,) + Series: \'a\' [cat] + [ + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-1, 1]" + "(-1, 1]" + "(-1, 1]" + "(-1, 1]" + "(1, inf]" + "(1, inf]" + "(1, inf]" + ] + >>> a.cut([-1, 1], series=True, left_closed=True) + shape: (12,) + Series: \'a\' [cat] + [ + "[-inf, -1)" + "[-inf, -1)" + "[-inf, -1)" + "[-inf, -1)" + "[-1, 1)" + "[-1, 1)" + "[-1, 1)" + "[-1, 1)" + "[1, inf)" + "[1, inf)" + "[1, inf)" + "[1, inf)" + ] + ''' + def qcut(self, quantiles: list[float]) -> DataFrame | Series: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + List of quantiles to create. + We expect quantiles ``0.0 <= quantile <= 1`` + labels + Labels to assign to the quantiles. If given the length of labels must be + len(bins) + 1. + break_point_label + Name given to the breakpoint column/field. Only used if series == False or + include_breaks == True + category_label + Name given to the category column. Only used if series == False. + maintain_order + Keep the order of the original `Series`. Only used if series == False. + series + If True, return a categorical series in the data\'s original order + left_closed + Whether intervals should be [) instead of (] + allow_duplicates + If True, the resulting quantile breaks don\'t have to be unique. This can + happen even with unique probs depending on the data. Duplicates will be + dropped, resulting in fewer bins. + include_breaks + Include the the right endpoint of the bin each observation falls in. + If returning a DataFrame, it will be a column, and if returning a Series + it will be a field in a Struct + + Returns + ------- + DataFrame or Series + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut([0.0, 0.25, 0.75], series=False) + shape: (8, 3) + ┌─────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪═══════════════╡ + │ -5 ┆ -5.0 ┆ (-inf, -5] │ + │ -4 ┆ -3.25 ┆ (-5, -3.25] │ + │ -3 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1 ┆ inf ┆ (0.25, inf] │ + │ 2 ┆ inf ┆ (0.25, inf] │ + └─────┴─────────────┴───────────────┘ + >>> a.qcut([0.0, 0.25, 0.75], series=True) + shape: (8,) + Series: \'a\' [cat] + [ + "(-inf, -5]" + "(-5, -3.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(0.25, inf]" + "(0.25, inf]" + ] + >>> a.qcut([0.0, 0.25, 0.75], series=True, left_closed=True) + shape: (8,) + Series: \'a\' [cat] + [ + "[-5, -3.25)" + "[-5, -3.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[0.25, inf)" + "[0.25, inf)" + ] + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + A Struct Series containing "lengths" and "values" Fields + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. 
+ + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Return a copy of the Series with a new alias/name. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> srs = pl.Series("x", [1, 2, 3]) + >>> new_aliased_srs = srs.alias("y") + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Series: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.append(s2) + shape: (6,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. 
+ + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. 
+ + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. 
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. 
+ strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. 
+ * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. 
This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. 
+ + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. 
+ + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. 
The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. 
+ + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. 
+ + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. 
+ + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) 
-> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. 
+ + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. 
+ + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. 
+ + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/dataframe/frame deleted file mode 100644 index 5412dfe..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/dataframe/frame +++ /dev/null @@ -1,280 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase, TextIOWrapper -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Null as Null, Object as Object, Struct as Struct, Time as Time, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.lazy import col as col, lit as lit -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame 
-from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnTotalsDefinition as ColumnTotalsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SelectorType as SelectorType, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils import NoDefault as NoDefault, no_default as no_default -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from pyarrow.interchange.dataframe import _PyArrowDataFrame as _PyArrowDataFrame -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... 
- @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ..., ignore_errors: bool = ...) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: ... 
- def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... 
- @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, has_header: bool = ..., separator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: dict[str | tuple[str, ...], str | dict[str, str]] | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: dict[str | tuple[str, ...], int] | int | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ..., freeze_panes: str | tuple[str, int, int] | tuple[int, int] | tuple[int, int, int, int] | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, object] | None = ...) -> None: ... - def write_database(self, table_name: str, connection_uri: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: str | Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... 
- @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, other: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: DataFrame) -> Self: ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: ... 
- def drop_in_place(self, name: str) -> Series: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: str | Iterable[str], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: str | Sequence[str] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... 
- def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - def rows_by_key(self, key: str | Sequence[str] | SelectorType, *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def corr(self, **kwargs: Any) -> DataFrame: ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/dataframe/frame.pyi new file mode 100644 index 0000000..f0b4edc --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/dataframe/frame.pyi @@ -0,0 +1,6048 @@ +#: version 0.18.8 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.groupby import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Null as Null, Object as Object, Struct as Struct, Time as Time, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col, lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, find_stacklevel as find_stacklevel, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, no_default as no_default, normalise_filepath as normalise_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: 
_ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. 
+ * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + Returns + ------- + DataFrame + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. 
+ + Returns + ------- + DataFrame + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Arrow IPC stream format. + + Arrow IPC is also know as Feather (v2). + + Parameters + ---------- + source + Path to a file or a file-like object. + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + Returns + ------- + DataFrame + + """ + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> _PyArrowDataFrame: + """ + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + allow_copy + Allow memory to be copied to perform the conversion. If set to False, causes + conversions that are not zero-copy to fail. + + Notes + ----- + Details on the dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> DataFrame: ... 
+ def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) 
-> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... ) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + ``structured`` is set to ``False`` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. 
+ + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'<f4\'), (\'ham\', \'<U1\')]) + + >>> import numpy as np + >>> df.to_numpy(True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'<f4\'), (\'ham\', \'<U1\')]) + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + <class \'pandas.core.frame.DataFrame\'> + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 <NA> + 1 2 <NA> b + 2 <NA> 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... 
) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. 
+ compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname:str,}`` dictionary for applying an Excel format string to the + given columns. Formats defined here (such as "dd/mm/yyyy", "0.00%", etc) + will override any defined in ``dtype_formats`` (below). + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A ``{colname(s):str,}``, ``{colname(s):dict,}``, or ``{colname(s):list,}`` + dictionary defining conditional format options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". 
+ column_widths : {dict, int} + A ``{colname:int,}`` dict or single integer that sets (or overrides if + autofitting) table column widths in integer pixel units. If given as an + integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : {dict, int} + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list of table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. 
Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + + Notes + ----- + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum") on all numeric columns, autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... 
position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... 
"formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + ''' + def write_database(self, table_name: str, connection_uri: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection_uri + Connection URI, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Note: Some polars data types like `Null`, `Categorical` and `Time` are + not supported by the delta protocol specification. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. 
+ + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Examples + -------- + Instantiate a basic dataframe: + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + + Write DataFrame as a Delta Lake table on local filesystem. + + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on local filesystem. + Note: This will fail if schema of the new data does not match the + schema of existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + Note: If the schema of the new and old data is same, + then setting `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table on cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. 
+ + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... 
) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... 
) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... 
) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. 
+ + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... 
"ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_nulls() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops rows where any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... 
+ >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.groupby("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.groupby("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.groupby(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. 
+ + >>> df.groupby("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.groupby("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + *groupby_dynamic*. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. 
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.groupby_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. 
+ This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... 
pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.groupby_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. 
+ every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). 
Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + Joined DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.apply(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.apply(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, *args, **kwargs) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from ``vstack`` which adds the chunks from ``other`` to the chunks of + this ``DataFrame``, ``extend`` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``vstack`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer ``vstack`` over ``extend`` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single ``DataFrame``. In the latter case, finish the sequence of + ``vstack`` operations with a ``rechunk``. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> df.drop("foo", "bar") + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value to fill NaN with. + + Returns + ------- + DataFrame with NaN replaced with fill_value + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... 
"b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: Sequence[str] | str, index: Sequence[str] | str, columns: Sequence[str] | str, aggregate_function: PivotAgg | Expr | None | NoDefault = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "one", "two", "two", "two"], + ... "bar": ["A", "B", "C", "A", "B", "C"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot( + ... values="baz", index="foo", columns="bar", aggregate_function="first" + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ A ┆ B ┆ C │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ one ┆ 1 ┆ 2 ┆ 3 │ + │ two ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: str | Sequence[str] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Name of the column(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "col1": list(ascii_uppercase[0:9]), + ... "col2": pl.int_range(0, 9, eager=True), + ... } + ... 
) + >>> df + shape: (9, 2) + ┌──────┬──────┐ + │ col1 ┆ col2 │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ A ┆ 0 │ + │ B ┆ 1 │ + │ C ┆ 2 │ + │ D ┆ 3 │ + │ E ┆ 4 │ + │ F ┆ 5 │ + │ G ┆ 6 │ + │ H ┆ 7 │ + │ I ┆ 8 │ + └──────┴──────┘ + >>> df.unstack(step=3, how="vertical") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │ + │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │ + │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + >>> df.unstack(step=3, how="horizontal") + shape: (3, 6) + ┌────────┬────────┬────────┬────────┬────────┬────────┐ + │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪════════╪════════╪════════╪════════╪════════╡ + │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │ + │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │ + │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │ + └────────┴────────┴────────┴────────┴────────┴────────┘ + + ''' + def partition_by(self, by: str | Iterable[str], *more_by: str) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Name of the column(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> df.partition_by("a", as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... 
) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: str | Sequence[str] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Name of the column(s) that should be converted to dummy variables. + If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. + + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.groupby(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + Tuple (default) or dictionary of row values. + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. 
+ + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using ``iter_rows`` instead to avoid + materialising all the data at once. + + Returns + ------- + A list of tuples (default) or dictionaries of row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + ''' + def rows_by_key(self, key: str | Sequence[str] | SelectorType) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. 
+ + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. 
+ + Returns + ------- + An iterator of tuples (default) or dictionaries (if named) of python row values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(record_batch, "\\n<< ", len(record_batch)) + ... + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 15000 + pyarrow.RecordBatch + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... 
) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy ``corrcoef``. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. 
+ + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/expr/expr deleted file mode 100644 index 1f470e4..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/expr/expr +++ /dev/null @@ -1,265 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.exceptions import PolarsPanicError as PolarsPanicError -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ApplyStrategy as ApplyStrategy, ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.decorators import deprecated_alias as deprecated_alias, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... 
- def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr) -> Self: ... - def __rxor__(self, other: Expr) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self, drop_nulls: bool = ...) -> Self: ... - def all(self, drop_nulls: bool = ...) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def cbrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def keep_name(self) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... 
- def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: list[float], labels: list[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: ... - def qcut(self, q: list[float] | int, labels: list[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> Self: ... - def rle(self) -> Self: ... - def rle_id(self) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: ApplyStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | None | Series | Expr) -> Self: ... 
- def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... 
- def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ..., fixed_seed: bool = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ..., fixed_seed: bool = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, multithreaded: bool = ..., sort: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/expr/expr.pyi new file mode 100644 index 0000000..5f40519 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/expr/expr.pyi @@ -0,0 +1,7672 @@ +#: version 0.18.8 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.decorators import deprecated_alias as deprecated_alias, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... 
+ def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr) -> Self: ... + def __rxor__(self, other: Expr) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self, drop_nulls: bool = ...) -> Self: + ''' + Check if any boolean value in a Boolean column is `True`. + + Parameters + ---------- + drop_nulls + If False, return None if there are nulls but no Trues. + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame({"TF": [True, False], "FF": [False, False]}) + >>> df.select(pl.all().any()) + shape: (1, 2) + ┌──────┬───────┐ + │ TF ┆ FF │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ false │ + └──────┴───────┘ + >>> df = pl.DataFrame(dict(x=[None, False], y=[None, True])) + >>> df.select(pl.col("x").any(True), pl.col("y").any(True)) + shape: (1, 2) + ┌───────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪══════╡ + │ false ┆ true │ + └───────┴──────┘ + >>> df.select(pl.col("x").any(False), pl.col("y").any(False)) + shape: (1, 2) + ┌──────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪══════╡ + │ null ┆ true │ + └──────┴──────┘ + + ''' + def all(self, drop_nulls: bool = ...) -> Self: + ''' + Check if all boolean values in a Boolean column are `True`. + + This method is an expression - not to be confused with + :func:`polars.all` which is a function to select all columns. + + Parameters + ---------- + drop_nulls + If False, return None if there are any nulls. + + + Returns + ------- + Boolean literal + + Examples + -------- + >>> df = pl.DataFrame( + ... {"TT": [True, True], "TF": [True, False], "FF": [False, False]} + ... 
) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ TT ┆ TF ┆ FF │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + >>> df = pl.DataFrame(dict(x=[None, False], y=[None, True])) + >>> df.select(pl.col("x").all(True), pl.col("y").all(True)) + shape: (1, 2) + ┌───────┬───────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + └───────┴───────┘ + >>> df.select(pl.col("x").all(False), pl.col("y").all(False)) + shape: (1, 2) + ┌──────┬──────┐ + │ x ┆ y │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪══════╡ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... ) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... 
) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().map_alias(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").keep_name()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. 
+ + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Iterable[str] | Iterable[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection. + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... 
) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + out + Series of type Boolean + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... 
pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) 
-> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done. + For instance, due to an overflow. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. 
+ + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Series of dtype UInt32. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. 
+ + When used in a projection/selection context, the whole column is sorted. + When used in a groupby context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a groupby context, the groups are sorted. + + >>> df.groupby("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.groupby("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Values taken by index + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.groupby("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... 
) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a groupby aggregation and joining the + result back into the original dataframe. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. 
+ + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: list[float], labels: list[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + A list of unique cut points. + labels + Labels to assign to bins. If given, the length must be len(breaks) + 1. + left_closed + Whether intervals should be [) instead of the default of (] + include_breaks + Include the the right endpoint of the bin each observation falls in. 
+ If True, the resulting column will be a Struct. + + Examples + -------- + >>> g = pl.repeat("a", 5, eager=True).append(pl.repeat("b", 5, eager=True)) + >>> df = pl.DataFrame(dict(g=g, x=range(10))) + >>> df.with_columns(q=pl.col("x").cut([2, 5])) + shape: (10, 3) + ┌─────┬─────┬───────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═══════════╡ + │ a ┆ 0 ┆ (-inf, 2] │ + │ a ┆ 1 ┆ (-inf, 2] │ + │ a ┆ 2 ┆ (-inf, 2] │ + │ a ┆ 3 ┆ (2, 5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (5, inf] │ + │ b ┆ 7 ┆ (5, inf] │ + │ b ┆ 8 ┆ (5, inf] │ + │ b ┆ 9 ┆ (5, inf] │ + └─────┴─────┴───────────┘ + >>> df.with_columns(q=pl.col("x").cut([2, 5], left_closed=True)) + shape: (10, 3) + ┌─────┬─────┬───────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═══════════╡ + │ a ┆ 0 ┆ [-inf, 2) │ + │ a ┆ 1 ┆ [-inf, 2) │ + │ a ┆ 2 ┆ [2, 5) │ + │ a ┆ 3 ┆ [2, 5) │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ [5, inf) │ + │ b ┆ 7 ┆ [5, inf) │ + │ b ┆ 8 ┆ [5, inf) │ + │ b ┆ 9 ┆ [5, inf) │ + └─────┴─────┴───────────┘ + ''' + def qcut(self, *args, **kwargs) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + q + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of evenly spaced probabilities to use. + labels + Labels to assign to bins. If given, the length must be len(probs) + 1. + If computing over groups this must be set for now. + left_closed + Whether intervals should be [) instead of the default of (] + allow_duplicates + If True, the resulting quantile breaks don\'t have to be unique. This can + happen even with unique probs depending on the data. Duplicates will be + dropped, resulting in fewer bins. + include_breaks + Include the the right endpoint of the bin each observation falls in. + If True, the resulting column will be a Struct. 
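# Editorial sketch of `cut` with explicit labels; as the parameter description
# above notes, len(labels) must equal len(breaks) + 1. Assumes this stub's
# polars version, imported as `pl`.
import polars as pl

df = pl.DataFrame({"x": [1, 3, 6]})
out = df.with_columns(
    pl.col("x").cut([2, 5], labels=["low", "mid", "high"]).alias("bin")
)
# 1 -> "low" ((-inf, 2]), 3 -> "mid" ((2, 5]), 6 -> "high" ((5, inf])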
+ + + Examples + -------- + >>> g = pl.repeat("a", 5, eager=True).append(pl.repeat("b", 5, eager=True)) + >>> df = pl.DataFrame(dict(g=g, x=range(10))) + >>> df.with_columns(q=pl.col("x").qcut([0.5])) + shape: (10, 3) + ┌─────┬─────┬─────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════════════╡ + │ a ┆ 0 ┆ (-inf, 4.5] │ + │ a ┆ 1 ┆ (-inf, 4.5] │ + │ a ┆ 2 ┆ (-inf, 4.5] │ + │ a ┆ 3 ┆ (-inf, 4.5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (4.5, inf] │ + │ b ┆ 7 ┆ (4.5, inf] │ + │ b ┆ 8 ┆ (4.5, inf] │ + │ b ┆ 9 ┆ (4.5, inf] │ + └─────┴─────┴─────────────┘ + >>> df.with_columns(q=pl.col("x").qcut(2)) + shape: (10, 3) + ┌─────┬─────┬─────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════════════╡ + │ a ┆ 0 ┆ (-inf, 4.5] │ + │ a ┆ 1 ┆ (-inf, 4.5] │ + │ a ┆ 2 ┆ (-inf, 4.5] │ + │ a ┆ 3 ┆ (-inf, 4.5] │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ (4.5, inf] │ + │ b ┆ 7 ┆ (4.5, inf] │ + │ b ┆ 8 ┆ (4.5, inf] │ + │ b ┆ 9 ┆ (4.5, inf] │ + └─────┴─────┴─────────────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.5], ["lo", "hi"]).over("g")) + shape: (10, 3) + ┌─────┬─────┬─────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ lo │ + │ a ┆ 1 ┆ lo │ + │ a ┆ 2 ┆ lo │ + │ a ┆ 3 ┆ hi │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ lo │ + │ b ┆ 7 ┆ lo │ + │ b ┆ 8 ┆ hi │ + │ b ┆ 9 ┆ hi │ + └─────┴─────┴─────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.5], ["lo", "hi"], True).over("g")) + shape: (10, 3) + ┌─────┬─────┬─────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ cat │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ lo │ + │ a ┆ 1 ┆ lo │ + │ a ┆ 2 ┆ hi │ + │ a ┆ 3 ┆ hi │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ lo │ + │ b ┆ 7 ┆ hi │ + │ b ┆ 8 ┆ hi │ + │ b ┆ 9 ┆ hi │ + └─────┴─────┴─────┘ + >>> df.with_columns(q=pl.col("x").qcut([0.25, 0.5], include_breaks=True)) + shape: (10, 3) + ┌─────┬─────┬───────────────────────┐ + │ g ┆ x ┆ q │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ struct[2] │ + ╞═════╪═════╪═══════════════════════╡ + │ a ┆ 0 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 1 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 2 ┆ {2.25,"(-inf, 2.25]"} │ + │ a ┆ 3 ┆ {4.5,"(2.25, 4.5]"} │ + │ … ┆ … ┆ … │ + │ b ┆ 6 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 7 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 8 ┆ {inf,"(4.5, inf]"} │ + │ b ┆ 9 ┆ {inf,"(4.5, inf]"} │ + └─────┴─────┴───────────────────────┘ + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + A Struct Series containing "lengths" and "values" Fields + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! 
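# Editorial sketch of `rle_id` defining groups by runs of identical values, as
# described above; assumes this stub's polars version, imported as `pl`.
import polars as pl

df = pl.DataFrame({"s": [1, 1, 2, 2, 2, 1]})
out = df.with_columns(run=pl.col("s").rle_id())
# consecutive duplicates share an id: run = [0, 0, 1, 1, 1, 2]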
+ >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.groupby("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. 
+ Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using ``apply`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").apply(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().groupby("b", maintain_order=True).agg( + ... pl.col("a").apply(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.groupby("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.groupby("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list Series. + + This means that every item is expanded to a new row. 
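# The `apply` docstring above discourages Python-level loops; an editorial
# sketch of the native-expression alternative it recommends (same polars
# version assumed, imported as `pl`).
import polars as pl

df = pl.DataFrame({"b": ["a", "b", "b"], "a": [1, 2, 3]})
# slow: a Python lambda is invoked once per group
slow = df.groupby("b", maintain_order=True).agg(pl.col("a").apply(lambda s: s.sum()))
# fast: the same aggregation stays inside the native engine
fast = df.groupby("b", maintain_order=True).agg(pl.col("a").sum())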
+ + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... 
.or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq_missing(pl.col("y")).alias("x == y"), + ... ) + shape: (6, 3) + ┌──────┬──────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞══════╪══════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + │ null ┆ 5.0 ┆ false │ + │ null ┆ null ┆ true │ + └──────┴──────┴────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... 
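# Editorial sketch contrasting `eq` and `eq_missing` on nulls, per the
# descriptions above (assumes this stub's polars version, imported as `pl`).
import polars as pl

df = pl.DataFrame({"x": [1.0, None], "y": [1.0, None]})
out = df.with_columns(
    eq=pl.col("x").eq(pl.col("y")),                  # null row stays null (nulls propagate)
    eq_missing=pl.col("x").eq_missing(pl.col("y")),  # null == null evaluates to true
)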
"x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne_missing(pl.col("y")).alias("x != y"), + ... ) + shape: (6, 3) + ┌──────┬──────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞══════╪══════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + │ null ┆ 5.0 ┆ true │ + │ null ┆ null ┆ false │ + └──────┴──────┴────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... 
) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + See Also + -------- + truediv + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... 
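# The arithmetic methods documented above mirror the Python operators; a short
# editorial sketch (same polars version assumed, imported as `pl`).
import polars as pl

df = pl.DataFrame({"x": [1, 2, 3]})
out = df.with_columns(
    ((pl.col("x") + 1) * 2).alias("via_operators"),
    pl.col("x").add(1).mul(2).alias("via_methods"),  # identical result, method form
)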
) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + See Also + -------- + floordiv + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").apply(binary_string).alias("bin_x"), + ... pl.col("y").apply(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x").xor(pl.col("y")).apply(binary_string).alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... 
) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Series of type List + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr that evaluates to a Boolean Series. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. 
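# Editorial sketch of `is_between` with the `closed` argument described above
# (assumes this stub's polars version, imported as `pl`).
import polars as pl

df = pl.DataFrame({"n": [1, 2, 3, 4]})
out = df.with_columns(
    pl.col("n").is_between(2, 3, closed="left").alias("in_range")  # [2, 3): true only for 2
)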
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, *args, **kwargs) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. 
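# Editorial sketch of `interpolate` filling interior nulls linearly, per the
# docstring above (assumes this stub's polars version, imported as `pl`).
import polars as pl

df = pl.DataFrame({"a": [1, None, None, 4]})
out = df.with_columns(pl.col("a").interpolate())
# interior nulls are filled along a straight line: 1, 2, 3, 4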
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... 
rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, *args, **kwargs) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". 
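# Editorial sketch of the `min_periods` behaviour documented for the rolling
# methods above: by default a full window is required before a value is
# emitted (assumes this stub's polars version, imported as `pl`).
import polars as pl

df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0]})
out = df.with_columns(
    pl.col("A").rolling_min(window_size=3).alias("strict"),                  # first two rows are null
    pl.col("A").rolling_min(window_size=3, min_periods=1).alias("lenient"),  # no leading nulls
)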
+ + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, *args, **kwargs) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, *args, **kwargs) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, *args, **kwargs) -> Self: + ''' + Compute a rolling standard deviation. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... 
rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, *args, **kwargs) -> Self: + ''' + Compute a rolling variance. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, *args, **kwargs) -> Self: + ''' + Compute a rolling median. 
+ + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, *args, **kwargs) -> Self: + ''' + Compute a rolling quantile. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `groupby_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... 
quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a custom rolling window function. + + Prefer the specific rolling window functions over this one, as they are faster. + + Prefer: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2.0, 9.0, 2.0, 13.0], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("A").rolling_apply(lambda s: s.std(), window_size=3), + ... ] + ... ) + shape: (5, 1) + ┌──────────┐ + │ A │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 4.358899 │ + │ 4.041452 │ + │ 5.567764 │ + └──────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f32 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f32 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. 
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. 
+ + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Series of dtype Float64 + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ..., fixed_seed: bool = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + fixed_seed + If True, The seed will not be incremented between draws. + This can make output predictable because draw ordering can + change due to threads being scheduled in a different order. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, *args, **kwargs) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + fixed_seed + If True, The seed will not be incremented between draws. + This can make output predictable because draw ordering can + change due to threads being scheduled in a different order. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count all unique values and create a struct mapping value to count. + + Parameters + ---------- + multithreaded: + Better to turn this off in the aggregation context, as it can lead to + contention. + sort: + Ensure the output is sorted from most values to least. + + Returns + ------- + Dtype Struct + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").value_counts(sort=True), + ... ] + ... ) + shape: (3, 1) + ┌───────────┐ + │ id │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {"c",3} │ + │ {"b",2} │ + │ {"a",1} │ + └───────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + This can actually hurt performance and can have a lot of contention. + It is advised not to use it until actually benchmarked on your problem. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. 
+ + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... 
) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/lazyframe/frame deleted file mode 100644 index e631bc9..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/lazyframe/frame +++ /dev/null @@ -1,128 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, CsvEncoding as CsvEncoding, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap 
import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath -from typing import Any, Callable, ClassVar, Collection, Concatenate, Iterable, Literal, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... - @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, file: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... 
- @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | Path | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., common_subplan_elimination: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: IntoExpr) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... 
- def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: str | Sequence[str] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: ... - def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... 
- def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..a23782d --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/lazyframe/frame.pyi @@ -0,0 +1,3429 @@ +#: version 0.18.8 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.groupby import LazyGroupBy as LazyGroupBy +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + read_json + + """ + @classmethod + def read_json(cls, file: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + file + Path to a file or a file-like object. + + See Also + -------- + LazyFrame.from_json, LazyFrame.write_json + + """ + def __bool__(self) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Write the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.read_json + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> lf.write_json() + \'{"DataFrameScan":{"df":{"columns":[{"name":"foo","datatype":"Int64","values":[1,2,3]},{"name":"bar","datatype":"Int64","values":[6,7,8]}]},"schema":{"inner":{"foo":"Int64","bar":"Int64"}},"output_schema":null,"projection":null,"selection":null}}\' + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. 
Defaults to ``False``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... 
) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. 
+ + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ groupby_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... 
) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. If None and ``use_pyarrow=True``, the row group size + will be the minimum of the DataFrame size and 64 * 1024 * 1024. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. 
This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + common_subplan_elimination + Will try to cache branching subplans that occur on self-joins or unions. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: IntoExpr) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a groupby operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default groupby. + Setting this to ``True`` blocks the possibility + to run on the streaming engine. 
+ + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.groupby("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.groupby("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.groupby(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.groupby("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_groupby`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use *groupby_dynamic*. + + If you have a time series ``<t_0, t_1, ..., t_n>``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.groupby_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal groupby is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). 
Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a groupby_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic groupby on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + groupby_rolling + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.groupby_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... 
} + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.groupby_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.groupby_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.groupby_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.groupby_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... 
).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic groupbys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.groupby_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic groupby on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.groupby_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. 
+ + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time" you use the following string + language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. 
+ how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. 
+ + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. 
+ + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: str | Collection[str], *more_columns: str) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> lf.drop(["bar", "ham"]).collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Or use positional arguments to drop multiple columns in the same way. + + >>> lf.drop("foo", "bar").collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + ''' + Approx count unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. 
+ strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Sequence[str] | Expr | Sequence[Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. 
+ + Parameters + ---------- + columns + Name of the column(s) to explode. Columns must be of datatype List or Utf8. + Accepts ``col`` expressions as input as well. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: str | Sequence[str] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) to consider when identifying duplicates. + If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: str | Collection[str] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop_nulls().collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null values based + on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... 
) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: str | list[str] | None = ..., value_vars: str | list[str] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.melt(id_vars="a", value_vars=["b", "c"]).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. 
+ + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... 
).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/series/series deleted file mode 100644 index b50501b..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/series/series +++ /dev/null @@ -1,345 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TimeUnit as TimeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.decorators import deprecated_alias as deprecated_alias -from polars.utils.meta import get_index_type as get_index_type 
-from polars.utils.various import _is_generator as _is_generator, find_stacklevel as find_stacklevel, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> int: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - @property - def time_unit(self) -> TimeUnit | None: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Self: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... 
- @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - @overload - def __floordiv__(self, other: Expr) -> Expr: ... - @overload - def __floordiv__(self, other: Any) -> Series: ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def cbrt(self) -> Series: ... - def any(self, drop_nulls: bool = ...) -> bool | None: ... - def all(self, drop_nulls: bool = ...) -> bool | None: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | None | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... 
- def to_dummies(self, separator: str = ...) -> DataFrame: ... - def cut(self, breaks: list[float], labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, series: bool = ..., left_closed: bool = ..., include_breaks: bool = ...) -> DataFrame | Series: ... - def qcut(self, q: list[float] | int, *, labels: list[str] | None = ..., break_point_label: str = ..., category_label: str = ..., series: bool = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> DataFrame | Series: ... - def rle(self) -> Series: ... - def rle_id(self) -> Series: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str, *, in_place: bool | None = ...) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool | None = ...) -> Self: ... - def extend(self, other: Series) -> Self: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... 
- def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... 
- def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... 
- @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/series/series.pyi new file mode 100644 index 0000000..0a6610a --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.18.8/polars/series/series.pyi @@ -0,0 +1,4353 @@ +#: version 0.18.8 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.decorators import deprecated_alias as deprecated_alias +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, find_stacklevel as find_stacklevel, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + 
+class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> int: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the + ``Series`` contains multiple chunks + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Self: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr == other`` where ``None == None``. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + """ + Method equivalent of equality operator ``expr != other`` where ``None == None``. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + """ + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ...
+ def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self, drop_nulls: bool = ...) -> bool | None: + """ + Check if any boolean value in the column is `True`. + + Returns + ------- + Boolean literal + + """ + def all(self, drop_nulls: bool = ...) 
-> bool | None: + """ + Check if all boolean values in the column are `True`. + + Returns + ------- + Boolean literal + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. + + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + Dictionary with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ max ┆ 5.0 │ + │ median ┆ 3.0 │ + │ 25% ┆ 2.0 │ + │ 75% ┆ 4.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmin` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, *args, **kwargs) -> DataFrame | Series: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + A list of unique cut points. + labels + Labels to assign to the bins. If given the length of labels must be + len(breaks) + 1. + break_point_label + Name given to the breakpoint column/field. Only used if series == False or + include_breaks == True + category_label + Name given to the category column. Only used if series == False + series + If True, return a categorical series in the data\'s original order. + left_closed + Whether intervals should be [) instead of (] + include_breaks + Include the right endpoint of the bin each observation falls in.
+ If returning a DataFrame, it will be a column, and if returning a Series + it will be a field in a Struct + + Returns + ------- + DataFrame or Series + + Examples + -------- + >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)]) + >>> a.cut([-1, 1], series=False) + shape: (12, 3) + ┌──────┬─────────────┬────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ cat │ + ╞══════╪═════════════╪════════════╡ + │ -3.0 ┆ -1.0 ┆ (-inf, -1] │ + │ -2.5 ┆ -1.0 ┆ (-inf, -1] │ + │ -2.0 ┆ -1.0 ┆ (-inf, -1] │ + │ -1.5 ┆ -1.0 ┆ (-inf, -1] │ + │ … ┆ … ┆ … │ + │ 1.0 ┆ 1.0 ┆ (-1, 1] │ + │ 1.5 ┆ inf ┆ (1, inf] │ + │ 2.0 ┆ inf ┆ (1, inf] │ + │ 2.5 ┆ inf ┆ (1, inf] │ + └──────┴─────────────┴────────────┘ + >>> a.cut([-1, 1], series=True) + shape: (12,) + Series: \'a\' [cat] + [ + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-inf, -1]" + "(-1, 1]" + "(-1, 1]" + "(-1, 1]" + "(-1, 1]" + "(1, inf]" + "(1, inf]" + "(1, inf]" + ] + >>> a.cut([-1, 1], series=True, left_closed=True) + shape: (12,) + Series: \'a\' [cat] + [ + "[-inf, -1)" + "[-inf, -1)" + "[-inf, -1)" + "[-inf, -1)" + "[-1, 1)" + "[-1, 1)" + "[-1, 1)" + "[-1, 1)" + "[1, inf)" + "[1, inf)" + "[1, inf)" + "[1, inf)" + ] + ''' + def qcut(self, *args, **kwargs) -> DataFrame | Series: + ''' + Discretize continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + q + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of evenly spaced probabilities to use. + labels + Labels to assign to the quantiles. If given the length of labels must be + len(breaks) + 1. + break_point_label + Name given to the breakpoint column/field. Only used if series == False or + include_breaks == True + category_label + Name given to the category column. Only used if series == False. + series + If True, return a categorical series in the data\'s original order + left_closed + Whether intervals should be [) instead of (] + allow_duplicates + If True, the resulting quantile breaks don\'t have to be unique. This can + happen even with unique probs depending on the data. Duplicates will be + dropped, resulting in fewer bins. + include_breaks + Include the the right endpoint of the bin each observation falls in. + If returning a DataFrame, it will be a column, and if returning a Series + it will be a field in a Struct + + Returns + ------- + DataFrame or Series + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Examples + -------- + >>> a = pl.Series("a", range(-5, 3)) + >>> a.qcut(2, series=True) + shape: (8,) + Series: \'a\' [cat] + [ + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-inf, -1.5]" + "(-1.5, inf]" + "(-1.5, inf]" + "(-1.5, inf]" + "(-1.5, inf]" + ] + >>> a.qcut([0.0, 0.25, 0.75], series=False) + shape: (8, 3) + ┌─────┬─────────────┬───────────────┐ + │ a ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪═══════════════╡ + │ -5 ┆ -5.0 ┆ (-inf, -5] │ + │ -4 ┆ -3.25 ┆ (-5, -3.25] │ + │ -3 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -2 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ -1 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 0 ┆ 0.25 ┆ (-3.25, 0.25] │ + │ 1 ┆ inf ┆ (0.25, inf] │ + │ 2 ┆ inf ┆ (0.25, inf] │ + └─────┴─────────────┴───────────────┘ + >>> a.qcut([0.0, 0.25, 0.75], series=True) + shape: (8,) + Series: \'a\' [cat] + [ + "(-inf, -5]" + "(-5, -3.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(-3.25, 0.25]" + "(0.25, inf]" + "(0.25, inf]" + ] + >>> a.qcut([0.0, 0.25, 0.75], series=True, left_closed=True) + shape: (8,) + Series: \'a\' [cat] + [ + "[-5, -3.25)" + "[-5, -3.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[-3.25, 0.25)" + "[0.25, inf)" + "[0.25, inf)" + ] + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + A Struct Series containing "lengths" and "values" Fields + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the unique values in a Series. + + Parameters + ---------- + sort + Ensure the output is sorted from most values to least. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.value_counts().sort(by="a") + shape: (3, 2) + ┌─────┬────────┐ + │ a ┆ counts │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + │ 3 ┆ 1 │ + └─────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a groupby or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Parameters + ---------- + name + New name. + in_place + Modify the Series in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and ``append`` will change to always + behave like ``append_chunks=True`` (the previous default). For the + behavior of ``append_chunks=False``, use ``Series.extend``. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. 
+ + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from ``append``, which adds the chunks from ``other`` to the chunks of + this series, ``extend`` appends the data from ``other`` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``append`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer ``append`` over ``extend`` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single ``Series``. In the latter case, finish the sequence + of ``append`` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. 
+ + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + Integer + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series.
+ + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + UInt32 Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Boolean Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Exploded Series of same dtype + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done for instance due to an overflow. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. 
+ + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. This operation clones data but is completely safe. + + If you want a zero-copy view and know what you are doing, use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + the series mutated + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). 
+ + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.apply(lambda x: x + 10) + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + New Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. 
The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a custom rolling window function. 
+ + Prefer the specific rolling window functions over this one, as they are faster: + + * rolling_min + * rolling_max + * rolling_mean + * rolling_sum + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("A", [11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> print(s.rolling_apply(function=np.nanstd, window_size=3)) + shape: (5,) + Series: \'A\' [f64] + [ + null + null + 3.858612 + 3.5 + 0.5 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. 
+ + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, *args, **kwargs) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. 
+ + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f32] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. 
+ + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. 
+ + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a flat Series of shape (len,). + If a multiple dimensions are given, results in a Series of Lists with shape + (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. 
+ + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def time_unit(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/dataframe/frame deleted file mode 100644 index cf412d8..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/dataframe/frame +++ /dev/null @@ -1,299 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase, TextIOWrapper -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions import col as col, lit as lit -from polars.interchange.dataframe import PolarsDataFrame as PolarsDataFrame -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars 
import PyDataFrame as PyDataFrame -from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnFormatDict as ColumnFormatDict, ColumnNameOrSelector as ColumnNameOrSelector, ColumnTotalsDefinition as ColumnTotalsDefinition, ColumnWidthsDefinition as ColumnWidthsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... 
- @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., schema: None | SchemaDict = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ...) -> Self: ... 
- @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ..., ignore_errors: bool = ...) -> Self: ... - def _replace(self, column: str, new_column: Series) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def flags(self) -> dict[str, dict[str, bool]]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Any]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... 
- @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: ColumnFormatDict | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., header_format: dict[str, Any] | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: ColumnWidthsDefinition | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ..., freeze_panes: str | tuple[int, int] | tuple[str, int, int] | tuple[int, int, int, int] | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - @overload - def write_ipc_stream(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... 
- def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, Any] | None = ...) -> None: ... - def write_database(self, table_name: str, connection: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: str | Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... 
- def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... - def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, other: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: DataFrame) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> DataFrame: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... 
- def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def approx_n_unique(self) -> DataFrame: ... - def approx_unique(self) -> DataFrame: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) 
-> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def corr(self, **kwargs: Any) -> DataFrame: ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/dataframe/frame.pyi new file mode 100644 index 0000000..7114da1 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/dataframe/frame.pyi @@ -0,0 +1,6578 @@ +#: version 0.19.0 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, 
ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. 
+ schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. 
+ nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. 
+ + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to ``True`` will raise a ``NotImplementedError``. + allow_copy + Allow memory to be copied to perform the conversion. If set to ``False``, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars dataframe to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Any]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. 
Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + ``structured`` is set to ``False`` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. 
, \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. 
This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. 
Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname(s):str,}`` or ``{selector:str,}`` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in ``dtype_formats``. + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + header_format : dict + A ``{key:value,}`` dictionary of ``xlsxwriter`` format options to apply + to the table header row, such as ``{"bold":True, "font_color":"#702963"}``. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. 
+ + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` or ``{selector:int,}`` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. 
+ freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible ``xlsxwriter`` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... 
) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... 
"style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC record batch data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + If you pass ``partition_cols`` here, the dataset will be written + using ``pyarrow.parquet.write_to_dataset``. + The ``partition_cols`` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. 
+ + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, *args, **kwargs) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. 
+ + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... 
"e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. 
+ null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, *args, **kwargs) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. 
+ + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... 
+ >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. 
+ + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``group_by_dynamic`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. 
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.group_by_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. 
+ This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg( + ... [ + ... 
pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. 
+ every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). 
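+
+        A minimal illustrative sketch of that advice (``slow_score`` is a
+        hypothetical, expensive pure function, not part of the polars API):
+
+        >>> from functools import lru_cache
+        >>> @lru_cache(maxsize=None)
+        ... def slow_score(row: tuple) -> int:
+        ...     return row[0] * 2 + row[1]  # stands in for an expensive computation
+        >>> pl.DataFrame({"foo": [1, 2], "bar": [-1, 5]}).map_rows(slow_score)  # doctest: +SKIP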
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, *args, **kwargs) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from ``vstack`` which adds the chunks from ``other`` to the chunks of + this ``DataFrame``, ``extend`` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``vstack`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer ``vstack`` over ``extend`` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single ``DataFrame``. In the latter case, finish the sequence of + ``vstack`` operations with a ``rechunk``. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
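+
+        A rough sketch of the guidance above (the CSV paths are illustrative
+        placeholders): a many-append workload would use ``vstack`` and finish
+        with a single ``rechunk``, whereas a single append right before a query
+        would use ``extend``:
+
+        >>> df = pl.read_csv("part_0.csv")  # doctest: +SKIP
+        >>> for path in ["part_1.csv", "part_2.csv"]:  # doctest: +SKIP
+        ...     df = df.vstack(pl.read_csv(path))
+        >>> df = df.rechunk()  # doctest: +SKIP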
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... 
) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self, *args, **kwargs) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using ``iter_rows`` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. 
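# --- illustrative usage sketch (not part of the generated stub) ---
# The `rows_by_key` method documented just above moves frame data into a
# key-indexed Python dictionary; one common use is a fast Python-side lookup
# table. A minimal sketch, assuming polars >= 0.19 is installed; the "users"
# frame and its column names are made up for illustration.
import polars as pl

users = pl.DataFrame(
    {
        "user_id": [17, 42, 99],
        "name": ["ada", "bob", "cid"],
        "active": [True, False, True],
    }
)
# `unique=True` maps each key to a single row; `named=True` returns dicts.
lookup = users.rows_by_key(key="user_id", named=True, unique=True)
assert lookup[42]["name"] == "bob"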
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. 
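# --- illustrative usage sketch (not part of the generated stub) ---
# `iter_slices` (documented above) yields non-copying DataFrame slices, which
# makes it convenient for chunked export. A minimal sketch, assuming
# polars >= 0.19 is installed; the frame contents and output file names are
# made up for illustration.
import polars as pl

big = pl.DataFrame({"id": range(25_000), "value": [0.5] * 25_000})
for idx, chunk in enumerate(big.iter_slices(n_rows=10_000)):
    # each `chunk` is an ordinary DataFrame, so any export method works on it
    chunk.write_csv(f"chunk_{idx}.csv")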
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy ``corrcoef``. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... 
).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def groupby(self, *args, **kwargs) -> GroupBy: + """ + Start a group by operation. + + Alias for :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, *args, **kwargs) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. 
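# --- illustrative usage sketch (not part of the generated stub) ---
# The deprecated `groupby` alias documented above carries no doctest of its
# own; a minimal sketch of the equivalent `group_by` aggregation, assuming
# polars >= 0.19 is installed. The "sales" data and column names are made up.
import polars as pl

sales = pl.DataFrame(
    {
        "store": ["a", "a", "b", "b"],
        "amount": [10, 20, 5, 7],
    }
)
per_store = sales.group_by("store", maintain_order=True).agg(
    pl.col("amount").sum().alias("total"),
    pl.col("amount").count().alias("n_sales"),
)
# `per_store` has one row per store with the `total` and `n_sales` columns.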
+ + Alias for :func:`DataFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, *args, **kwargs) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Alias for :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. 
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, *args, **kwargs) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/expr/expr deleted file mode 100644 index 7f1203f..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/expr/expr +++ /dev/null @@ -1,268 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, MapElementsStrategy as MapElementsStrategy, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as 
deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int | bool) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int | bool) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr | int | bool) -> Self: ... - def __rxor__(self, other: Any) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self, *, ignore_nulls: bool = ...) -> Self: ... - def all(self, *, ignore_nulls: bool = ...) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def cbrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def keep_name(self) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... 
- def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: ... - def qcut(self, quantiles: Sequence[float] | int, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> Self: ... - def rle(self) -> Self: ... - def rle_id(self) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... 
- def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | None | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) 
-> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... 
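# --- illustrative note (not part of the generated stub) ---
# The old stub lines above still list `Expr.map` / `Expr.apply`; in polars
# 0.19 these are the deprecated spellings of `map_batches` / `map_elements`
# (both pairs appear in the 0.19 stubs). A minimal sketch of the newer names,
# assuming polars >= 0.19 is installed; the frame and column names are made up.
import polars as pl

df = pl.DataFrame({"x": [1, 2, 3]})
out = df.select(
    pl.col("x").map_batches(lambda s: s * 2).alias("doubled"),    # whole-Series UDF
    pl.col("x").map_elements(lambda v: v + 1).alias("plus_one"),  # per-element UDF
)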
- def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/expr/expr.pyi new file mode 100644 index 0000000..dd2cb23 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/expr/expr.pyi @@ -0,0 +1,7778 @@ +#: version 0.19.0 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = 
... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self, *args, **kwargs) -> Self: + ''' + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. 
_Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self, *args, **kwargs) -> Self: + ''' + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. 
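# --- illustrative usage sketch (not part of the generated stub) ---
# Beyond the column-wise doctests above, `Expr.any` / `Expr.all` are commonly
# used inside an aggregation to get per-group booleans. A minimal sketch,
# assuming polars >= 0.19 is installed; the "events" data is made up.
import polars as pl

events = pl.DataFrame(
    {
        "job": ["etl", "etl", "report", "report"],
        "failed": [False, True, False, False],
    }
)
status = events.group_by("job", maintain_order=True).agg(
    pl.col("failed").any().alias("any_failure"),
    pl.col("failed").all().alias("all_failed"),
)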
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... ) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().map_alias(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().reverse().prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").keep_name()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. 
removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... 
) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... 
) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. 
+ + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. 
+ strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. 
+ + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmin` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original dataframe. 
+ + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... 
) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. 
+ + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! 
+ >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a Series or sequence of Series. + + The output of this custom function must be a Series. + If you want to apply a custom function elementwise over single values, see + :func:`apply`. A use case for ``map`` is when you want to transform an + expression with a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. 
+ + Depending on the context it has the following behavior: + + * Selection + Expects `f` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `f` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Parameters + ---------- + function + Lambda/ function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don\'t map the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {\'thread_local\', \'threading\'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using ``map`` is strongly discouraged as you will be effectively running + python "for" loops. This will be very slow. Wherever possible you should + strongly prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().group_by("b", maintain_order=True).agg( + ... pl.col("a").map_elements(lambda x: x.sum()) + ... ).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.group_by("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... 
) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... 
) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other`` where ``None == None``. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. 
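Editorial sketch (not part of the generated stub): the arithmetic methods documented above (``add``, ``sub``, ``mul``, ``mod``) mirror their operators and can be chained inside a single expression. A minimal example, assuming only that polars is imported as ``pl``; the column names and values are illustrative:

```python
import polars as pl

df = pl.DataFrame({"x": [1, 2, 3]})

# Equivalent to ((x + 1) * 2 - 3) % 4, written with the method forms above
out = df.with_columns(
    pl.col("x").add(1).mul(2).sub(3).mod(4).alias("y")
)
```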
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... 
) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... 
) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... 
) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, *args, **kwargs) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, *args, **kwargs) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. 
+ + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, *args, **kwargs) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, *args, **kwargs) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... 
) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, *args, **kwargs) -> Self: + ''' + Compute a rolling standard deviation. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... 
rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, *args, **kwargs) -> Self: + ''' + Compute a rolling variance. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, *args, **kwargs) -> Self: + ''' + Compute a rolling median. 
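Editorial sketch (not part of the generated stub): the Notes in the rolling aggregations above suggest ``group_by_rolling`` when several statistics share the same dynamic window, but give no example. A minimal sketch of that pattern, assuming polars is imported as ``pl`` and a version that exposes ``group_by_rolling``; the column names and the ``2h`` period are illustrative:

```python
from datetime import datetime

import polars as pl

df = pl.DataFrame(
    {
        "date": pl.date_range(
            datetime(2001, 1, 1), datetime(2001, 1, 1, 4), "1h", eager=True
        ),
        "value": [1.0, 2.0, 3.0, 4.0, 5.0],
    }
)

# Define the temporal window once, then compute several aggregations over it
out = df.group_by_rolling(index_column="date", period="2h").agg(
    pl.col("value").min().alias("roll_min"),
    pl.col("value").max().alias("roll_max"),
    pl.col("value").mean().alias("roll_mean"),
)
```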
+ + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, *args, **kwargs) -> Self: + ''' + Compute a rolling quantile. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... 
quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. 
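A rough arithmetic sketch of what this computes (the column name ``price`` is illustrative, and the rendered output is omitted rather than guessed; the full parameter description and a null-handling example follow below): with the default ``n=1``, each value is ``current / previous - 1``, so the expected result for the frame below is ``[null, 0.1, 0.1]``:

>>> df = pl.DataFrame({"price": [100.0, 110.0, 121.0]})
>>> df.select(pl.col("price").pct_change())  # doctest: +SKIP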
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. 
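As a sketch of the "when, then, otherwise" alternative mentioned above (the column name ``foo`` mirrors the example below; this is a template, not a guaranteed drop-in replacement for every dtype), the following behaves like ``clip(1, 10)`` for an integer column and leaves nulls as nulls:

>>> df = pl.DataFrame({"foo": [-50, 5, None, 50]})
>>> df.with_columns(
...     pl.when(pl.col("foo") < 1)
...     .then(1)
...     .when(pl.col("foo") > 10)
...     .then(10)
...     .otherwise(pl.col("foo"))
...     .alias("foo_clipped")
... )  # doctest: +SKIP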
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. 
+ + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. 
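``n`` and ``fraction`` cannot be combined; the example below uses ``fraction``, so here is a small sketch using ``n`` instead (the seed value is arbitrary, and the output is not shown since the sampled rows depend on it):

>>> df = pl.DataFrame({"a": [1, 2, 3]})
>>> df.select(pl.col("a").sample(n=2, seed=0))  # doctest: +SKIP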
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self, *args, **kwargs) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. 
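A quick way to sanity-check the numerical-stability claim above (a sketch; the column name ``a`` matches the example below) is to compare ``log1p`` with the naive ``(x + 1).log()`` form:

>>> df = pl.DataFrame({"a": [1.0, 2.0, 3.0]})
>>> df.select(
...     pl.col("a").log1p().alias("log1p"),
...     (pl.col("a") + 1).log().alias("naive"),
... )  # doctest: +SKIP

For the values above the two columns agree; for inputs very close to zero the naive form loses precision, which is what ``log1p`` avoids.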
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... 
).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self, *args, **kwargs) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def map(self, *args, **kwargs) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, *args, **kwargs) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, *args, **kwargs) -> Self: + """ + Apply a custom rolling window function. + + .. 
deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/lazyframe/frame deleted file mode 100644 index fd22e6d..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/lazyframe/frame +++ /dev/null @@ -1,156 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat, subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, ColumnNameOrSelector as ColumnNameOrSelector, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._async import _AsyncDataFrameResult as _AsyncDataFrameResult -from 
polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath -from queue import Queue -from typing import Any, Callable, ClassVar, Collection, Concatenate, Iterable, Literal, Mapping, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... - @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., schema: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... 
- @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... - def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def serialize(self, file: None = ...) -> str: ... - @overload - def serialize(self, file: IOBase | str | Path) -> None: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | Path | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... 
- def collect_async(self, queue: Queue[DataFrame | Exception], *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> _AsyncDataFrameResult[DataFrame]: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_csv(self, path: str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: IntoExpr) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... 
- def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: ... - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map_batches(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... 
- def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..2c5b2ba --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/lazyframe/frame.pyi @@ -0,0 +1,3971 @@ +#: version 0.19.0 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AsyncDataFrameResult as _AsyncDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, 
normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... + @classmethod + def from_json(cls, *args, **kwargs) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to ``StringIO`` + and then use ``LazyFrame.deserialize``. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, *args, **kwargs) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to ``deserialize``. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... 
+ def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"LocalProjection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, *args, **kwargs) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self, *args, **kwargs) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``True``. 
+ If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self, *args, **kwargs) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... 
"c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... 
) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self, *args, **kwargs) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self, *args, **kwargs) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... 
"c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self, queue: Queue[DataFrame | Exception]) -> _AsyncDataFrameResult[DataFrame]: + ''' + Collect dataframe asynchronously in thread pool. + + Collects into a DataFrame, like :func:`collect` + but instead of returning dataframe directly its collected inside thread pool + and gets put into `queue` with `put_nowait` method, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + You must use correct queue in that case. + Given `queue` must be thread safe! + + For gevent use + [`gevent.queue.Queue`](https://www.gevent.org/api/gevent.queue.html#gevent.queue.Queue). + + For asyncio + [`asyncio.queues.Queue`](https://docs.python.org/3/library/asyncio-queue.html#queue) + can not be used, since it\'s not thread safe! + For that purpose use [janus](https://github.com/aio-libs/janus) library. + + Notes + ----- + Results are put in queue exactly once using `put_nowait`. + If error occurred then Exception will be put in the queue instead of result + which is then raised by returned wrapper `get` method. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + Wrapper that has `get` method and `queue` attribute with given queue. + `get` accepts kwargs that are passed down to `queue.get`. + + Examples + -------- + >>> import queue + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> a = ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async(queue.Queue()) + ... ) + >>> a.get() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. 
Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. 
+ - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def fetch(self, *args, **kwargs) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... 
"bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: IntoExpr) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. 
+ The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_group_by`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). 
Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.group_by_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. 
+ + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. 
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_rolling + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... [ + ... 
pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... 
).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... 
).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. 
+ + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self, *args, **kwargs) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... 
) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. 
+ + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. 
+ predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> lf.map_batches(lambda x: 2 * x).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def groupby(self, *args, **kwargs) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, *args, **kwargs) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, *args, **kwargs) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, *args, **kwargs) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
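# --- Illustrative usage (editor's sketch, not part of the generated stub or of this diff) ---
# The LazyFrame stub above documents `with_columns_seq` and `set_sorted` without doctest
# examples. This is a minimal sketch based only on the signatures shown in the stub;
# the frame contents and column names are made up for illustration.
import polars as pl

lf = pl.LazyFrame({"a": [1, 2, 3]})

# with_columns_seq: same result as with_columns, but the expressions are evaluated
# sequentially instead of in parallel (intended for cheap per-expression work).
out = lf.with_columns_seq(double=pl.col("a") * 2, plus_one=pl.col("a") + 1).collect()

# set_sorted: declare that "a" is already sorted so downstream operations that require
# sorted input (e.g. join_asof) can rely on it without re-checking.
sorted_lf = lf.set_sorted("a")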
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/series/series deleted file mode 100644 index 0c98cdf..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/series/series +++ /dev/null @@ -1,362 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.deprecation import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, 
deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, Literal, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> tuple[int, int, int]: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Series: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... 
- @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - @overload - def __floordiv__(self, other: Expr) -> Expr: ... - @overload - def __floordiv__(self, other: Any) -> Series: ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def __column_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def cbrt(self) -> Series: ... - @overload - def any(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def any(self, *, ignore_nulls: bool) -> bool | None: ... - @overload - def all(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def all(self, *, ignore_nulls: bool) -> bool | None: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | None | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... 
- def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: bool) -> Series | DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: bool) -> Series | DataFrame: ... - def rle(self) -> Series: ... - def rle_id(self) -> Series: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool | None = ...) -> Self: ... - def extend(self, other: Series) -> Self: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... 
- def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... 
- def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... 
- def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... 
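In the regenerated 0.19.0 stub added below, the @overload signatures from the previous stub (visible in the removed lines above) are collapsed into single methods with union return types, for example def __add__(self, other: Any) -> Self | DataFrame | Expr: ..., and the upstream docstrings are inlined. A minimal sketch of how that difference can surface for a type checker resolving these stubs; the snippet and variable names are illustrative only and are not part of this PR:

    import polars as pl

    s = pl.Series("a", [1, 2, 3])

    # Under the previous overload-based stub, `s + 1` matched the
    # `__add__(self, other: Any) -> Self` overload and narrowed to Series.
    # Under the generated 0.19.0 stub the same expression is typed as the
    # union Series | DataFrame | Expr, so callers that need a Series must
    # narrow it themselves, e.g. with an isinstance check:
    total = s + 1
    if isinstance(total, pl.Series):
        print(total.sum())

Accepting that loss of overload precision is part of the trade-off of generating the stubs with stubgen in inspect mode (which also brings in the docstrings) rather than maintaining the signatures separately.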
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/series/series.pyi new file mode 100644 index 0000000..43a7728 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.0/polars/series/series.pyi @@ -0,0 +1,4569 @@ +#: version 0.19.0 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.deprecation import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: 
_ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the ``Series`` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series == other`` where `None` == None`. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series != other`` where `None` == None`. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) 
-> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self, *args, **kwargs) -> bool | None: + """ + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self, *args, **kwargs) -> bool | None: + """ + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. 
+ + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + category_label + Name of the category column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. 
+ + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, *args, **kwargs) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + category_label + Name of the category column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. + + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. 
+ + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴────────┘ + + Sort the output by count. 
+ + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and ``append`` will change to always + behave like ``append_chunks=True`` (the previous default). For the + behavior of ``append_chunks=False``, use ``Series.extend``. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. 
+ + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from ``append``, which adds the chunks from ``other`` to the chunks of + this series, ``extend`` appends the data from ``other`` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``append`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer ``append`` over ``extend`` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single ``Series``. In the latter case, finish the sequence + of ``append`` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. 
+ + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. 
+ + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point ``nan`` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set ``zero_copy_only=True``. + + Alternatively, if you want a zero-copy view and know what you are doing, + use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. 
+ Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + Series + The mutated series. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). 
+ + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + With suitable data you may achieve order-of-magnitude speedups (or more). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. 
Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. 
+ + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. 
+ + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. 
+ + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. 
+ + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. 
+ + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, *args, **kwargs) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
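# --- Editorial usage sketch: illustrative only, not part of the generated stub or of this diff ---
# The untyped properties declared above (`bin`, `cat`, `dt`, `list`, `arr`, `str`, `struct`)
# are polars' namespace accessors; stubgen emits them here without annotations. A minimal,
# hedged example of how they are reached at runtime (assumes a polars release in the range
# these stubs target):
import polars as pl
from datetime import date

s = pl.Series("a", ["foo", "bar"])
s.str.to_uppercase()                        # string namespace -> Series ["FOO", "BAR"]
pl.Series([date(2021, 1, 1)]).dt.year()     # temporal namespace -> Series [2021]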
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/dataframe/frame deleted file mode 100644 index 1c5d3f6..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/dataframe/frame +++ /dev/null @@ -1,300 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase, TextIOWrapper -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions import col as col, lit as lit -from polars.interchange.dataframe import PolarsDataFrame as PolarsDataFrame -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnFormatDict as ColumnFormatDict, ColumnNameOrSelector as ColumnNameOrSelector, ColumnTotalsDefinition as ColumnTotalsDefinition, ColumnWidthsDefinition as ColumnWidthsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as 
PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... 
- @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., schema: None | SchemaDict = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ..., ignore_errors: bool = ...) -> Self: ... - def _replace(self, column: str, new_column: Series) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def flags(self) -> dict[str, dict[str, bool]]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... 
- def _div(self, other: Any, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Series]: ... - def __reversed__(self) -> Iterator[Series]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... 
- @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: ColumnFormatDict | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., header_format: dict[str, Any] | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: ColumnWidthsDefinition | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ..., freeze_panes: str | tuple[int, int] | tuple[str, int, int] | tuple[int, int, int, int] | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - @overload - def write_ipc_stream(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, Any] | None = ...) -> None: ... - def write_database(self, table_name: str, connection: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: str | Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... 
- def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... - def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... 
- def vstack(self, other: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: DataFrame) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> DataFrame: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) 
-> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def approx_n_unique(self) -> DataFrame: ... - def approx_unique(self) -> DataFrame: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def corr(self, **kwargs: Any) -> DataFrame: ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... 
- def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/dataframe/frame.pyi new file mode 100644 index 0000000..4abba1a --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/dataframe/frame.pyi @@ -0,0 +1,6579 @@ +#: version 0.19.1 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, DataTypeClass as DataTypeClass, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.lazy import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert 
import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
+ schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. 
+ schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading ``n_rows``. 
+ row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to ``True`` will raise a ``NotImplementedError``. + allow_copy + Allow memory to be copied to perform the conversion. If set to ``False``, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars dataframe to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (<DtypeKind.FLOAT: 2>, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any, floordiv: bool) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ...
+ def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + ``structured`` is set to ``False`` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. 
, \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + <class \'pandas.core.frame.DataFrame\'> + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 <NA> + 1 2 <NA> b + 2 <NA> 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json.
This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. 
Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname(s):str,}`` or ``{selector:str,}`` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in ``dtype_formats``. + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + header_format : dict + A ``{key:value,}`` dictionary of ``xlsxwriter`` format options to apply + to the table header row, such as ``{"bold":True, "font_color":"#702963"}``. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. 
+ + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` or ``{selector:int,}`` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. 
+ freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible ``xlsxwriter`` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... 
) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... 
"style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC record batch data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + If you pass ``partition_cols`` here, the dataset will be written + using ``pyarrow.parquet.write_to_dataset``. + The ``partition_cols`` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. 
+ + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, *args, **kwargs) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. 
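+
+        For ``write_database`` (documented above without an example), a minimal
+        illustrative call might look like the sketch below. The table name
+        ``"records"`` and the ``sqlite:///local.db`` URI are placeholders, and the
+        call assumes SQLAlchemy is installed for the default engine.
+
+        >>> df = pl.DataFrame({"foo": [1, 2, 3]})
+        >>> df.write_database(
+        ...     table_name="records",
+        ...     connection="sqlite:///local.db",
+        ...     if_exists="replace",
+        ...     engine="sqlalchemy",
+        ... )  # doctest: +SKIP
+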
+ + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
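+
+        One way to "do it differently" (illustrative sketch only): if the goal is
+        long-format data rather than a literal row/column swap, ``melt`` keeps the
+        work columnar and is typically cheaper than a full transpose. The
+        ``variable_name``/``value_name`` labels below are arbitrary choices.
+
+        >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
+        >>> df.melt(variable_name="column", value_name="value")  # doctest: +SKIP
+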
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the dataframe. + + The formatting is done one line per column, so wide dataframes show nicely. + Each line will show the column name, the data type and the first few values. + + Parameters + ---------- + return_as_string + If True, return as string rather than printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, b, c + $ e usd, eur, None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... 
"e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. 
+ null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, *args, **kwargs) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. 
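+
+        A quick illustrative check of the alias (any small frame works):
+
+        >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5]})
+        >>> df.limit(3).frame_equal(df.head(3))
+        True
+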
+ + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... 
+ >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. 
+ + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``group_by_dynamic`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. 
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.group_by_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. 
+ This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg( + ... [ + ... 
pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. 
+ every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, *args, **kwargs) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from ``vstack`` which adds the chunks from ``other`` to the chunks of + this ``DataFrame``, ``extend`` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``vstack`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer ``vstack`` over ``extend`` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single ``DataFrame``. In the latter case, finish the sequence of + ``vstack`` operations with a ``rechunk``. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... 
) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self, *args, **kwargs) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using ``iter_rows`` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy ``corrcoef``. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... 
).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def groupby(self, *args, **kwargs) -> GroupBy: + """ + Start a group by operation. + + Alias for :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, *args, **kwargs) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. 
+ + Alias for :func:`DataFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, *args, **kwargs) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Alias for :func:`DataFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. 
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, *args, **kwargs) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/expr/expr deleted file mode 100644 index 546c9f9..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/expr/expr +++ /dev/null @@ -1,269 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, MapElementsStrategy as MapElementsStrategy, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as 
deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int | bool) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int | bool) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr | int | bool) -> Self: ... - def __rxor__(self, other: Any) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self, *, ignore_nulls: bool = ...) -> Self: ... - def all(self, *, ignore_nulls: bool = ...) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def cbrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def keep_name(self) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def is_not(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... 
- def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_last(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: ... - def qcut(self, quantiles: Sequence[float] | int, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> Self: ... - def rle(self) -> Self: ... - def rle_id(self) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... 
- def where(self, predicate: Expr) -> Self: ... - def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | None | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... 
- def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... 
- def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/expr/expr.pyi new file mode 100644 index 0000000..365dbdf --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/expr/expr.pyi @@ -0,0 +1,7848 @@ +#: version 0.19.1 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor 
+from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self, *args, **kwargs) -> Self: + ''' + Return whether any of the values in the column are ``True``. 
+ + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self, *args, **kwargs) -> Self: + ''' + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... ) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().map_alias(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().reverse().prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").keep_name()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. 
removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").is_not()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... 
) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... 
) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. 
+ + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. 
+ strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. 
+ + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original dataframe. 
+ + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... 
) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_last(self) -> Self: + ''' + Get a mask of the last unique value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_last().alias("is_last")) + shape: (5, 2) + ┌─────┬─────────┐ + │ num ┆ is_last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ true │ + │ 5 ┆ true │ + └─────┴─────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... 
).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. 
This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for ``map`` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Notes + ----- + If you are looking to map a function over a window function or groupby context, + refer to func:`map_elements` instead. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + map_elements + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... 
) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + Depending on the context it has the following behavior: + + * Selection + Expects `function` to be of type Callable[[Any], Any]. + Applies a python function over each individual value in the column. + * GroupBy + Expects `function` to be of type Callable[[Series], Series]. + Applies a python function over each group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be ``pl.Unknown``. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). + pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using ``map_elements`` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using ``over`` is considered a GroupBy context + here, so ``map_elements`` can be used to map functions over window groups. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + In a selection context, the function is applied by row. + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + It is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context the function is applied by group: + + >>> df.lazy().group_by("b", maintain_order=True).agg( + ... pl.col("a").map_elements(lambda x: x.sum()) + ... 
).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + It is better to implement this with an expression: + + >>> df.group_by("b", maintain_order=True).agg( + ... pl.col("a").sum(), + ... ) # doctest: +IGNORE_RESULT + + Window function application using ``over`` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... ).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) 
-> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... 
"y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. 
Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) 
-> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, *args, **kwargs) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). 
Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, *args, **kwargs) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, *args, **kwargs) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, *args, **kwargs) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. 
+ + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, *args, **kwargs) -> Self: + ''' + Compute a rolling standard deviation. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... 
{"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, *args, **kwargs) -> Self: + ''' + Compute a rolling variance. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). 
Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, *args, **kwargs) -> Self: + ''' + Compute a rolling median. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, *args, **kwargs) -> Self: + ''' + Compute a rolling quantile. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. 
+ + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. 
math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self, *args, **kwargs) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts, and might be faster. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk))`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...)
-> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to use fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self, *args, **kwargs) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use ``pl.first()`` to keep the original value.
+ return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def map(self, *args, **kwargs) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, *args, **kwargs) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, *args, **kwargs) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
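Aside (editor's sketch, not part of the generated stub or of Polars itself): the ``ewm_mean``/``ewm_std``/``ewm_var`` docstrings above state how ``com``, ``span`` and ``half_life`` each map onto the smoothing factor alpha, while the stub only exposes the bare signature ``_prepare_alpha(...) -> float``. The minimal Python sketch below illustrates that normalisation under the assumption that exactly one of the four decay parameters is supplied; the helper name ``prepare_alpha_sketch`` is hypothetical.

from __future__ import annotations

import math


def prepare_alpha_sketch(
    com: float | None = None,
    span: float | None = None,
    half_life: float | None = None,
    alpha: float | None = None,
) -> float:
    # Illustrative only: map a single EWM decay parameter to alpha using the
    # formulas quoted in the ewm_* docstrings above.
    given = [p for p in (com, span, half_life, alpha) if p is not None]
    if len(given) != 1:
        raise ValueError("pass exactly one of com, span, half_life, alpha")
    if com is not None:  # alpha = 1 / (1 + com), for com >= 0
        return 1.0 / (1.0 + com)
    if span is not None:  # alpha = 2 / (span + 1), for span >= 1
        return 2.0 / (span + 1.0)
    if half_life is not None:  # alpha = 1 - exp(-ln(2) / half_life), for half_life > 0
        return 1.0 - math.exp(-math.log(2.0) / half_life)
    return float(alpha)  # 0 < alpha <= 1 is used as-is

For ``com=1`` this gives alpha = 0.5, which is consistent with the ``ewm_mean(com=1)`` example above: with ``adjust=True`` the second output value is (2 + 0.5 * 1) / (1 + 0.5) ≈ 1.666667.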
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/lazyframe/frame deleted file mode 100644 index fd22e6d..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/lazyframe/frame +++ /dev/null @@ -1,156 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat, subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, ColumnNameOrSelector as ColumnNameOrSelector, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._async import _AsyncDataFrameResult as _AsyncDataFrameResult -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath -from queue import Queue -from typing import Any, Callable, ClassVar, Collection, Concatenate, Iterable, Literal, Mapping, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | 
None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... - @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., schema: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... - def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def serialize(self, file: None = ...) -> str: ... - @overload - def serialize(self, file: IOBase | str | Path) -> None: ... - @overload - def write_json(self, file: None = ...) -> str: ... 
- @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | Path | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def collect_async(self, queue: Queue[DataFrame | Exception], *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> _AsyncDataFrameResult[DataFrame]: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... 
- def sink_csv(self, path: str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: IntoExpr) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... 
- def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: ... - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map_batches(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... 
- def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..dc77c12 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/lazyframe/frame.pyi @@ -0,0 +1,3981 @@ +#: version 0.19.1 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AsyncDataFrameResult as _AsyncDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. 
+ + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... + @classmethod + def from_json(cls, *args, **kwargs) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to ``StringIO`` + and then use ``LazyFrame.deserialize``. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, *args, **kwargs) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to ``deserialize``. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. 
+ + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"LocalProjection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, *args, **kwargs) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self, *args, **kwargs) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``True``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... 
).explain() # doctest: +SKIP + ''' + def show_graph(self, *args, **kwargs) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self, *args, **kwargs) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. 
+ + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self, *args, **kwargs) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self, queue: Queue[DataFrame | Exception]) -> _AsyncDataFrameResult[DataFrame]: + ''' + Collect dataframe asynchronously in thread pool. 
+ + Collects into a DataFrame, like :func:`collect` + but instead of returning dataframe directly its collected inside thread pool + and gets put into `queue` with `put_nowait` method, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + You must use correct queue in that case. + Given `queue` must be thread safe! + + For gevent use + [`gevent.queue.Queue`](https://www.gevent.org/api/gevent.queue.html#gevent.queue.Queue). + + For asyncio + [`asyncio.queues.Queue`](https://docs.python.org/3/library/asyncio-queue.html#queue) + can not be used, since it\'s not thread safe! + For that purpose use [janus](https://github.com/aio-libs/janus) library. + + Notes + ----- + Results are put in queue exactly once using `put_nowait`. + If error occurred then Exception will be put in the queue instead of result + which is then raised by returned wrapper `get` method. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + Wrapper that has `get` method and `queue` attribute with given queue. + `get` accepts kwargs that are passed down to `queue.get`. + + Examples + -------- + >>> import queue + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> a = ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async(queue.Queue()) + ... ) + >>> a.get() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. 
+ simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. 
+ projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def fetch(self, *args, **kwargs) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: IntoExpr) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. 
+ The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_group_by`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). 
Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.group_by_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. 
+ + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. 
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_rolling + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... [ + ... 
pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... 
).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ] + ... 
).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").is_not()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").is_not(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. 
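+
+ Notes
+ -----
+ No join is performed: the columns of ``other`` are only made visible to
+ expressions evaluated on this frame, so the two frames do not need to share
+ a key or have the same number of rows (see the examples below).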
+ + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self, *args, **kwargs) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... 
) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
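+
+ Notes
+ -----
+ The variance is the square of the standard deviation returned by
+ :func:`LazyFrame.std` for the same ``ddof``.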
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. 
+ + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. 
+ predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def groupby(self, *args, **kwargs) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, *args, **kwargs) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, *args, **kwargs) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, *args, **kwargs) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
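+
+ # A minimal usage sketch (illustrative only, not part of the generated stub):
+ # the methods declared above build a lazy query that only runs on `.collect()`.
+ # It uses only calls documented in the docstrings above.
+ #
+ #   import polars as pl
+ #
+ #   lf = pl.LazyFrame({"a": [1, None, 3], "b": [2.0, 4.0, 6.0]})
+ #   out = (
+ #       lf.fill_null(strategy="forward")
+ #       .with_columns(ab=pl.col("a") * pl.col("b"))
+ #       .collect()
+ #   )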
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/series/series deleted file mode 100644 index 4cc06bc..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/series/series +++ /dev/null @@ -1,363 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.deprecation import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, 
deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, Literal, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> tuple[int, int, int]: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Series: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Self: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Self: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Self: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Self: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Self: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... 
- @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - @overload - def __floordiv__(self, other: Expr) -> Expr: ... - @overload - def __floordiv__(self, other: Any) -> Series: ... - def __invert__(self) -> Self: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def __column_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def cbrt(self) -> Series: ... - @overload - def any(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def any(self, *, ignore_nulls: bool) -> bool | None: ... - @overload - def all(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def all(self, *, ignore_nulls: bool) -> bool | None: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | None | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... 
- def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: bool) -> Series | DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: bool) -> Series | DataFrame: ... - def rle(self) -> Series: ... - def rle_id(self) -> Series: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool | None = ...) -> Self: ... - def extend(self, other: Series) -> Self: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... 
- def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_last(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... 
- def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... - def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... 
- def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... 
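Between the deleted stub above and the regenerated series.pyi below, the most visible typing change is that the old per-operand overloads are collapsed into single signatures with union return types; roughly, using __eq__ as the example (both signatures copied from the surrounding diff):

    # old stub (deleted above): overloads pick the return type per operand
    @overload
    def __eq__(self, other: Expr) -> Expr: ...
    @overload
    def __eq__(self, other: Any) -> Self: ...

    # regenerated stub (added below): one signature with a union return
    def __eq__(self, other: Any) -> Self | Expr: ...

The regenerated file also carries the "#: version 0.19.1" header and the inline docstrings that the old extensionless stub lacked.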
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/series/series.pyi new file mode 100644 index 0000000..6fa1611 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.1/polars/series/series.pyi @@ -0,0 +1,4578 @@ +#: version 0.19.1 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.deprecation import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: 
_ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the ``Series`` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Self: ... + def __eq__(self, other: Any) -> Self | Expr: ... + def __ne__(self, other: Any) -> Self | Expr: ... + def __gt__(self, other: Any) -> Self | Expr: ... + def __lt__(self, other: Any) -> Self | Expr: ... + def __ge__(self, other: Any) -> Self | Expr: ... + def __le__(self, other: Any) -> Self | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series == other`` where `None` == None`. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series != other`` where `None` == None`. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Self: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) 
-> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self, *args, **kwargs) -> bool | None: + """ + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self, *args, **kwargs) -> bool | None: + """ + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. 
+ + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + category_label + Name of the category column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. 
+ + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, *args, **kwargs) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + category_label + Name of the category column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. + + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. 
+ + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴────────┘ + + Sort the output by count. 
+ + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and ``append`` will change to always + behave like ``append_chunks=True`` (the previous default). For the + behavior of ``append_chunks=False``, use ``Series.extend``. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. 
+ + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from ``append``, which adds the chunks from ``other`` to the chunks of + this series, ``extend`` appends the data from ``other`` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``append`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer ``append`` over ``extend`` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single ``Series``. In the latter case, finish the sequence + of ``append`` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. 
+ + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`.
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_last(self) -> Series: + """ + Get a mask of the last unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. 
+ + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point ``nan`` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set ``zero_copy_only=True``. + + Alternatively, if you want a zero-copy view and know what you are doing, + use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. 
+ Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + Series + The mutated series. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). 
+ + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. 
Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. 
+ + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. 
+ + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. 
+ + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. 
+ + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. 
+ + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, *args, **kwargs) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
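Editor's note (not part of the generated stub): the untyped `@property` members above are the built-in Series accessor namespaces (`.str`, `.dt`, `.list`, `.arr`, `.struct`, ...). A minimal runtime sketch of what a few of them expose, assuming only that polars is installed; the sample values are illustrative.

# Illustrative only: what the untyped properties above resolve to at runtime.
import polars as pl
from datetime import date

s = pl.Series("words", ["stub", "gen"])
print(s.dtype, s.name, s.shape)        # Utf8 words (2,)
print(s.str.to_uppercase().to_list())  # ['STUB', 'GEN'] via the .str namespace

d = pl.Series("days", [date(2023, 1, 1), date(2023, 6, 1)])
print(d.dt.year().to_list())           # [2023, 2023] via the .dt namespace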
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/dataframe/frame deleted file mode 100644 index 9a9b851..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/dataframe/frame +++ /dev/null @@ -1,302 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase, TextIOWrapper -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions import col as col, lit as lit -from polars.interchange.dataframe import PolarsDataFrame as PolarsDataFrame -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnFormatDict as ColumnFormatDict, ColumnNameOrSelector as ColumnNameOrSelector, ColumnTotalsDefinition as ColumnTotalsDefinition, ColumnWidthsDefinition as ColumnWidthsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Label as Label, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as 
ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SelectorType as SelectorType, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... 
- @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., schema: None | SchemaDict = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ..., ignore_errors: bool = ...) -> Self: ... - def _replace(self, column: str, new_column: Series) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def flags(self) -> dict[str, dict[str, bool]]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... 
- def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, *, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state: list[Series]) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Series]: ... - def __reversed__(self) -> Iterator[Series]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... 
- @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote_char: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote_char: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: ColumnFormatDict | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., header_format: dict[str, Any] | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: ColumnWidthsDefinition | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | SelectorType | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ..., freeze_panes: str | tuple[int, int] | tuple[str, int, int] | tuple[int, int, int, int] | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - @overload - def write_ipc_stream(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, Any] | None = ...) -> None: ... - def write_database(self, table_name: str, connection: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: str | Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... 
- def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: ... - @overload - def glimpse(self, *, max_items_per_column: int = ..., max_colname_length: int = ..., return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, max_items_per_column: int = ..., max_colname_length: int = ..., return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool | None = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., label: Label = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... 
- def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... - def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, other: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: DataFrame) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> DataFrame: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, n: int = ...) -> DataFrame: ... - def shift_and_fill(self, fill_value: int | str | float, *, n: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... 
- @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def approx_n_unique(self) -> DataFrame: ... - def approx_unique(self) -> DataFrame: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | Series | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... 
- def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def corr(self, **kwargs: Any) -> DataFrame: ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/dataframe/frame.pyi new file mode 100644 index 0000000..da0ad16 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/dataframe/frame.pyi @@ -0,0 +1,6756 @@ +#: version 0.19.11 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as 
parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. 
+ + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. 
+ rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) 
-> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to ``True`` will raise a ``NotImplementedError``. + allow_copy + Allow memory to be copied to perform the conversion. If set to ``False``, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars dataframe to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... 
+ def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + ``structured`` is set to ``False`` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. 
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. 
Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname(s):str,}`` or ``{selector:str,}`` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in ``dtype_formats``. + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + header_format : dict + A ``{key:value,}`` dictionary of ``xlsxwriter`` format options to apply + to the table header row, such as ``{"bold":True, "font_color":"#702963"}``. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. 
+ + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` or ``{selector:int,}`` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. 
+ freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible ``xlsxwriter`` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... 
) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... 
"style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC record batch data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + If you pass ``partition_cols`` here, the dataset will be written + using ``pyarrow.parquet.write_to_dataset``. + The ``partition_cols`` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. 
+ + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, table_name: str, connection: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. 
+ + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... ) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. 
+ + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. 
When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. 
+ Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. 
+ + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... 
) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. 
+ + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``group_by_dynamic`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. 
warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is ``\'window\'``. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. 
+ + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... 
"time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... 
datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
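+
+        A minimal sketch of the ``lru_cache`` tip above (illustrative only; the helper
+        name ``expensive`` and the sample frame are assumptions, not part of the polars API):
+
+        .. code-block:: python
+
+            from functools import lru_cache
+
+            import polars as pl
+
+            @lru_cache(maxsize=None)
+            def expensive(x: int) -> int:
+                # stand-in for a costly pure function of a single row value
+                return x * x
+
+            df = pl.DataFrame({"foo": [1, 2, 1, 2], "bar": [-1, 5, -1, 5]})
+            # repeated "foo" values are computed only once and then served from the cache
+            df.map_rows(lambda row: expensive(row[0]))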
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from ``vstack`` which adds the chunks from ``other`` to the chunks of + this ``DataFrame``, ``extend`` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``vstack`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer ``vstack`` over ``extend`` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single ``DataFrame``. In the latter case, finish the sequence of + ``vstack`` operations with a ``rechunk``. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
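+
+        For comparison, the ``vstack``-then-``rechunk`` pattern described above could look
+        like this rough sketch (the loop and the sample frames are assumptions made for
+        illustration, not polars-prescribed code):
+
+        .. code-block:: python
+
+            import polars as pl
+
+            frames = [pl.DataFrame({"foo": [i], "bar": [i * 10]}) for i in range(3)]
+            out = frames[0]
+            for df_ in frames[1:]:
+                out = out.vstack(df_)  # cheap append; accumulates chunks
+            out = out.rechunk()  # collapse into contiguous memory before querying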
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. 
+ + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of places. + + Parameters + ---------- + n + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift values by the given number of places and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.shift_and_fill(n=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + \'\'\' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + \'\'\' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using ``iter_rows`` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + \'\'\' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + \'\'\' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. 
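A minimal usage sketch (not part of the generated stub), assuming only the ``named`` and ``buffer_size`` parameters of ``iter_rows`` described above:

import polars as pl

df = pl.DataFrame({"a": list(range(10_000)), "b": list(range(10_000))})

total = 0
# named=True yields dicts keyed by column name; buffer_size controls how many
# rows are converted to Python objects per internal batch.
for row in df.iter_rows(named=True, buffer_size=1_000):
    total += row["a"]

print(total)  # 49995000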
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy ``corrcoef``. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... 
).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the values in `other`. + + By default, null values in the right dataframe are ignored. Use + `ignore_nulls=False` to overwrite values in this frame with null values in other + frame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... 
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. 
+ Doing so incorrectly will lead to incorrect output + + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is ``\'window\'``. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/expr/expr deleted file mode 100644 index e47e63b..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/expr/expr +++ /dev/null @@ -1,277 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, MapElementsStrategy as MapElementsStrategy, NullBehavior as NullBehavior, NumericLiteral as NumericLiteral, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as 
SearchSortedSide, TemporalLiteral as TemporalLiteral, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int | bool) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int | bool) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr | int | bool) -> Self: ... - def __rxor__(self, other: Any) -> Self: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self, *, ignore_nulls: bool = ...) -> Self: ... - def all(self, *, ignore_nulls: bool = ...) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def cbrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... 
- def keep_name(self) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def is_not(self) -> Self: ... - def not_(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int | IntoExprColumn = ...) -> Self: ... - def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, n: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, n: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., check_sorted: bool = ...) -> Self: ... 
- def is_unique(self) -> Self: ... - def is_first_distinct(self) -> Self: ... - def is_last_distinct(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: ... - def qcut(self, quantiles: Sequence[float] | int, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> Self: ... - def rle(self) -> Self: ... - def rle_id(self) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | None | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... 
- def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int | IntoExprColumn = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: ... - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: ... - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def cot(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | IntoExprColumn | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... 
- def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def is_first(self) -> Self: ... - def is_last(self) -> Self: ... - def _register_plugin(self, *, lib: str, symbol: str, args: list[IntoExpr] | None = ..., kwargs: dict[Any, Any] | None = ..., is_elementwise: bool = ..., input_wildcard_expansion: bool = ..., auto_explode: bool = ..., cast_to_supertypes: bool = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/expr/expr.pyi new file mode 100644 index 0000000..b50bc53 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/expr/expr.pyi @@ -0,0 +1,8130 @@ +#: version 0.19.11 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... 
+ def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. 
+ + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().map_alias(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. 
+ + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").keep_name()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... 
) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Return the number of elements in the column. + + .. warning:: + Null values are treated like regular elements in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values are treated like regular elements in this context. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. 
+ upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. 
+ + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... 
) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... 
pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... 
) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, n: int = ...) -> Self: + ''' + Shift values by the given number of places. + + Parameters + ---------- + n + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.with_columns(foo_shifted=pl.col("foo").shift(1)) + shape: (4, 2) + ┌─────┬─────────────┐ + │ foo ┆ foo_shifted │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════════════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴─────────────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift values by the given number of places and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.with_columns(foo_shifted=pl.col("foo").shift_and_fill("a", n=1)) + shape: (4, 2) + ┌─────┬─────────────┐ + │ foo ┆ foo_shifted │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════════════╡ + │ 1 ┆ a │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴─────────────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... 
) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... 
) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. 
+ + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... 
"2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... ) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. 
+ allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. 
If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for ``map`` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Notes + ----- + If you are looking to map a function over a window function or group_by context, + refer to func:`map_elements` instead. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + map_elements + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type ``Callable[[Any], Any]``. + Applies a Python function to each individual value in the column. 
+ * GroupBy + Expects `function` to be of type ``Callable[[Series], Any]``. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be ``pl.Unknown``. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). + pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using ``map_elements`` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using ``over`` is considered a GroupBy context + here, so ``map_elements`` can be used to map functions over window groups. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... 
) # doctest: +IGNORE_RESULT + + Window function application using ``over`` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... ).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other`` where ``None == None``. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... 
"y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr != other`` where ``None == None``. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. 
Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) 
-> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... 
window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
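# --- Editorial aside (not part of the generated stub) ------------------------
# Hand check of the weighted rolling_mean example shown below: the supplied
# weights [0.25, 0.75] already sum to 1, so each result is the weighted
# average of the window; for the second window [1.0, 2.0]:
print(1.0 * 0.25 + 2.0 * 0.75)  # 1.75, matching the second row of the table
# ------------------------------------------------------------------------------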
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
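# --- Editorial aside (not part of the generated stub) ------------------------
# A hand computation, based only on the docstring examples that follow, of the
# temporal-window semantics for rolling_sum over the hourly `row_nr` column
# with window_size="2h": at t = 03:00 the window [01:00, 03:00) under
# closed="left" contains rows 1 and 2, while closed="both" ([01:00, 03:00])
# also includes row 3.
print(1 + 2)      # 3 -> value reported for row 3 with closed="left"
print(1 + 2 + 3)  # 6 -> value reported for row 3 with closed="both"
# ------------------------------------------------------------------------------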
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... 
rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... 
rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. 
Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. 
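# --- Editorial aside (not part of the generated stub) ------------------------
# A pure-Python sketch, using only the standard library, that reproduces the
# `nansum` rolling_map example shown further down in this docstring, and a
# reminder of why the specialized rolling expressions are preferred: the
# custom function is invoked once per window.
import math

vals = [11.0, 2.0, 9.0, float("nan"), 8.0]
for i in range(2, len(vals)):           # first two rows are null in the docstring
    window = vals[i - 2 : i + 1]        # output, since min_periods defaults to
    print(sum(v for v in window         # the window size
              if not math.isnan(v)))    # 22.0, 11.0, 17.0
# ------------------------------------------------------------------------------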
+ + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) 
-> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. 
+ If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. 
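# --- Editorial aside (not part of the generated stub) ------------------------
# The values shown in the lower_bound() example above and the upper_bound()
# example that follows are simply the bounds of a signed 64-bit integer; a
# quick check with plain Python:
print(-(2**63), 2**63 - 1)  # -9223372036854775808 9223372036854775807
# ------------------------------------------------------------------------------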
+ + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. 
+ If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. 
+ + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. 
+ pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def _register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by ``lib::symbol`` + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. + auto_explode + Explode the results in a group_by. + This is recommended for aggregation functions. + cast_to_supertypes + Cast the input datatypes to their supertype. + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
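The generated Expr stub ends here; it keeps the upstream docstrings verbatim for methods such as clip, ewm_mean, value_counts and map_dict. Below is a minimal sketch of how the generated annotations can be exercised, assuming a matching polars 0.19.x (and optionally mypy) is installed; the file name and the smoke test itself are illustrative and not part of the generated output:

# check_expr_stub.py -- hypothetical smoke test for the generated Expr stub.
# Running `mypy check_expr_stub.py` (or executing the file directly) exercises a
# few of the methods whose signatures and docstrings appear in the stub above.
import polars as pl

df = pl.DataFrame({"foo": [-50, 5, None, 50]})

# clip as documented in the stub: limit values to [1, 10]; nulls pass through.
clipped = df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped"))

# ewm_mean and value_counts return expressions; select() materialises them.
ewm = df.select(pl.col("foo").ewm_mean(com=1).alias("ewm"))
counts = df.select(pl.col("foo").value_counts(sort=True))

print(clipped, ewm, counts, sep="\n")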
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/lazyframe/frame deleted file mode 100644 index 6342dc5..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/lazyframe/frame +++ /dev/null @@ -1,161 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat, numpy as np, subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, ColumnNameOrSelector as ColumnNameOrSelector, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Label as Label, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath -from typing import Any, Awaitable, Callable, ClassVar, 
Collection, Concatenate, Iterable, Literal, Mapping, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - @classmethod - def _scan_csv(cls, source: str | list[str] | list[Path], *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., schema: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str | list[str] | list[Path], *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ..., hive_partitioning: bool = ..., retries: int = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path | list[str] | list[Path], *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str | Path | list[str] | list[Path], *, infer_schema_length: int | None = ..., schema: SchemaDefinition | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, *, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... 
- def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def serialize(self, file: None = ...) -> str: ... - @overload - def serialize(self, file: IOBase | str | Path) -> None: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | Path | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., no_optimization: bool = ..., streaming: bool = ..., _eager: bool = ...) -> DataFrame: ... - @overload - def collect_async(self, *, gevent: Literal[True], type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> _GeventDataFrameResult[DataFrame]: ... - @overload - def collect_async(self, *, gevent: Literal[False] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) 
-> Awaitable[DataFrame]: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., no_optimization: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., no_optimization: bool = ...) -> DataFrame: ... - def sink_csv(self, path: str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote_char: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., no_optimization: bool = ...) -> DataFrame: ... - def _set_sink_optimizations(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., no_optimization: bool = ...) -> PyLazyFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool | None = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., label: Label = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... 
- def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, n: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, n: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: ... - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map_batches(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... 
- def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... 
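The old, docstring-free LazyFrame stub is deleted here and replaced by the annotated frame.pyi added below, which carries the upstream docstrings for serialize/deserialize, pipe, sort, top_k and the rest. A minimal usage sketch of those methods follows, assuming a matching polars 0.19.x is installed; it is illustrative only and mirrors the examples embedded in the stub's own docstrings:

# Hypothetical exercise of LazyFrame methods covered by the new stub.
import io

import polars as pl

lf = pl.LazyFrame({"a": ["a", "b", "a", "b", "b", "c"], "b": [2, 1, 1, 3, 2, 1]})

# top_k as documented: rows holding the 4 largest values of column "b".
top = lf.top_k(4, by="b").collect()

# serialize/deserialize round-trip of a logical plan, as in the stub docstring.
plan = pl.LazyFrame({"a": [1, 2, 3]}).sum().serialize()
roundtrip = pl.LazyFrame.deserialize(io.StringIO(plan)).collect()

print(top, roundtrip, sep="\n")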
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..ff69316 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/lazyframe/frame.pyi @@ -0,0 +1,4157 @@ +#: version 0.19.11 +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to ``StringIO`` + and then use ``LazyFrame.deserialize``. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to ``deserialize``. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. 
+ + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``True``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... 
"b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. 
+ + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to ``False``. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to ``False`` (default), the entire query is processed in a single + batch. + + .. warning:: + This functionality is currently in an alpha state. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... 
) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + dataframe directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. 
+ Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. 
+ float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that ``fetch`` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if ``n_rows`` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... 
) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. 
+ + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... ).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... 
).collect() # doctest: +SKIP
+ shape: (3, 3)
+ ┌─────┬─────┬─────┐
+ │ a ┆ b ┆ c │
+ │ --- ┆ --- ┆ --- │
+ │ str ┆ i64 ┆ f64 │
+ ╞═════╪═════╪═════╡
+ │ a ┆ 0 ┆ 4.0 │
+ │ b ┆ 1 ┆ 3.0 │
+ │ c ┆ 1 ┆ 1.0 │
+ └─────┴─────┴─────┘
+ 
+ '''
+ def rolling(self, index_column: IntoExpr) -> LazyGroupBy:
+ '''
+ Create rolling groups based on a time, Int32, or Int64 column.
+ 
+ Different from a ``dynamic_group_by`` the windows are now determined by the
+ individual values and are not of constant intervals. For constant intervals
+ use :func:`LazyFrame.group_by_dynamic`.
+ 
+ If you have a time series ``<t_0, t_1, ..., t_n>``, then by default the
+ windows created will be
+ 
+ * (t_0 - period, t_0]
+ * (t_1 - period, t_1]
+ * ...
+ * (t_n - period, t_n]
+ 
+ The `period` and `offset` arguments are created either from a timedelta, or
+ by using the following string language:
+ 
+ - 1ns (1 nanosecond)
+ - 1us (1 microsecond)
+ - 1ms (1 millisecond)
+ - 1s (1 second)
+ - 1m (1 minute)
+ - 1h (1 hour)
+ - 1d (1 calendar day)
+ - 1w (1 calendar week)
+ - 1mo (1 calendar month)
+ - 1q (1 calendar quarter)
+ - 1y (1 calendar year)
+ - 1i (1 index count)
+ 
+ Or combine them:
+ "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+ 
+ Suffix with `"_saturating"` to indicate that dates too large for
+ their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28)
+ instead of erroring.
+ 
+ By "calendar day", we mean the corresponding time on the next day (which may
+ not be 24 hours, due to daylight savings). Similarly for "calendar week",
+ "calendar month", "calendar quarter", and "calendar year".
+ 
+ In case of a rolling operation on an integer column, the windows are defined by:
+ 
+ - "1i" # length 1
+ - "10i" # length 10
+ 
+ Parameters
+ ----------
+ index_column
+ Column used to group based on the time window.
+ Often of type Date/Datetime.
+ This column must be sorted in ascending order (or, if `by` is specified,
+ then it must be sorted in ascending order within each group).
+ 
+ In case of a rolling group by on indices, dtype needs to be one of
+ {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if
+ performance matters use an Int64 column.
+ period
+ length of the window - must be non-negative
+ offset
+ offset of the window. Default is -period
+ closed : {\'right\', \'left\', \'both\', \'none\'}
+ Define which sides of the temporal interval are closed (inclusive).
+ by
+ Also group by this column/these columns
+ check_sorted
+ When the ``by`` argument is given, polars can not check sortedness
+ by the metadata and has to do a full scan on the index column to
+ verify data is sorted. This is expensive. If you are sure the
+ data within the by groups is sorted, you can set this to ``False``.
+ Doing so incorrectly will lead to incorrect output
+ 
+ Returns
+ -------
+ LazyGroupBy
+ Object you can call ``.agg`` on to aggregate by groups, the result
+ of which will be sorted by `index_column` (but note that if `by` columns are
+ passed, it will only be sorted within each `by` group).
+ 
+ See Also
+ --------
+ group_by_dynamic
+ 
+ Examples
+ --------
+ >>> dates = [
+ ... "2020-01-01 13:45:48",
+ ... "2020-01-01 16:42:13",
+ ... "2020-01-01 16:45:09",
+ ... "2020-01-02 18:12:48",
+ ... "2020-01-03 19:45:32",
+ ... "2020-01-08 23:16:43",
+ ... ]
+ >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns(
+ ... pl.col("dt").str.strptime(pl.Datetime).set_sorted()
+ ... )
+ >>> out = (
+ ... df.rolling(index_column="dt", period="2d")
+ ... .agg(
+ ... 
pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... ) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... 
+ * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. 
+ allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. 
+ + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. 
+ + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... 
) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, n: int = ...) -> Self: + ''' + Shift values by the given number of places. + + Parameters + ---------- + n + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift values by the given number of places and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, n=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(fill_value=0, n=-1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. 
The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... 
"b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. 
+ By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... 
) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. 
+ + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on; if given ``None`` the implicit row + index is used as a join key instead. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. 
+ * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is ``\'window\'``. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. 
This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
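The hunk above ends the added LazyFrame stub content: it declares only signatures, docstrings, and the trailing columns/dtypes/schema/width properties. Purely for orientation — this is a minimal, hypothetical sketch and is not part of the generated stub or of this diff — a few of the methods documented above compose like this; the frame contents and column names are invented:

import polars as pl

# Illustrative data only; "a" and "b" are arbitrary column names.
lf = pl.LazyFrame({"a": [1, 2, None, 4], "b": ["x", "y", "y", None]})

result = (
    lf.fill_null(strategy="forward")            # fill nulls, as documented above
    .with_columns(doubled=pl.col("a") * 2)      # keyword-named expression input
    .unique(subset=["b"], maintain_order=True)  # de-duplicate on a subset of columns
    .collect()                                  # materialise the lazy query
)
print(result)

This is only a usage sketch against the API the docstrings describe; nothing here is added to the stub file itself.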
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/series/series deleted file mode 100644 index 730f122..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/series/series +++ /dev/null @@ -1,367 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, NullBehavior as NullBehavior, NumericLiteral as NumericLiteral, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TemporalLiteral as TemporalLiteral -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.deprecation 
import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, Literal, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> tuple[int, int, int]: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Series: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Series: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Series: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Series: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Series: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Series: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Series: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... 
- def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - @overload - def __floordiv__(self, other: Expr) -> Expr: ... - @overload - def __floordiv__(self, other: Any) -> Series: ... - def __invert__(self) -> Series: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def __column_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _repr_html_(self) -> str: ... - def item(self, index: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def cbrt(self) -> Series: ... - @overload - def any(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def any(self, *, ignore_nulls: bool) -> bool | None: ... - @overload - def all(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def all(self, *, ignore_nulls: bool) -> bool | None: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... 
- def pow(self, exponent: int | float | None | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: bool) -> Series | DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: bool) -> Series | DataFrame: ... - def rle(self) -> Series: ... - def rle_id(self) -> Series: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool | None = ...) -> Self: ... - def extend(self, other: Series) -> Self: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... 
- def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int | IntoExprColumn = ...) -> Series: ... - def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def not_(self) -> Series: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first_distinct(self) -> Series: ... - def is_last_distinct(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool | None = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... 
- def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def cot(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, n: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, n: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int | IntoExprColumn = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... 
- def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: ... - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: ... - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def is_first(self) -> Series: ... - def is_last(self) -> Series: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/series/series.pyi
new file mode 100644
index 0000000..925f9b7
--- /dev/null
+++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.11/polars/series/series.pyi
@@ -0,0 +1,4752 @@
+#: version 0.19.11
+import np as np
+import pa as pa
+import pd as pd
+from builtins import PySeries
+from datetime import date, datetime, timedelta
+from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8
+from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code
+from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat
+from polars.exceptions import ShapeError as ShapeError
+from polars.series.array import ArrayNameSpace as ArrayNameSpace
+from polars.series.binary import BinaryNameSpace as BinaryNameSpace
+from polars.series.categorical import CatNameSpace as CatNameSpace
+from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace
+from polars.series.list import ListNameSpace as ListNameSpace
+from polars.series.string import StringNameSpace as StringNameSpace
+from polars.series.struct import StructNameSpace as StructNameSpace
+from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func
+from polars.slice import PolarsSlice as PolarsSlice
+from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries
+from polars.utils._wrap import wrap_df as wrap_df
+from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time
+from polars.utils.deprecation import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning
+from polars.utils.meta import get_index_type as get_index_type
+from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor
+from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence
+
+TYPE_CHECKING: bool
+INTEGER_DTYPES: frozenset
+_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the ``Series`` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series == other`` where ``None == None``. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series != other`` where ``None == None``. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With an index, this is equivalent to ``s[index]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) 
-> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. 
+ + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. 
+ + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + category_label + Name of the category column. Only used if ``include_breaks`` is set to + ``True``. + + .. 
deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + category_label + Name of the category column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. 
+ + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. + + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. 
+ + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴────────┘ + + Sort the output by count. + + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and ``append`` will change to always + behave like ``append_chunks=True`` (the previous default). For the + behavior of ``append_chunks=False``, use ``Series.extend``. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). 
+ + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. + + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from ``append``, which adds the chunks from ``other`` to the chunks of + this series, ``extend`` appends the data from ``other`` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``append`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer ``append`` over ``extend`` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single ``Series``. In the latter case, finish the sequence + of ``append`` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. 
If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. 
+ side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no ``null`` values. + + Notes + ----- + While the *absence* of a validity bitmask guarantees that a Series does not + have ``null`` values, the converse is not true, eg: the *presence* of a + bitmask does not mean that there are null values, as every value of the + bitmask could be ``false``. + + To confirm that a column has ``null`` values use :func:`null_count`. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def not_(self) -> Series: + ''' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Return the number of elements in this Series. + + Null values are treated like regular elements in this context. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. 
Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point ``nan`` values can be zero-copied; + - booleans can\'t be zero-copied. 
+ + To ensure that no data is cloned, set ``zero_copy_only=True``. + + Alternatively, if you want a zero-copy view and know what you are doing, + use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + <class \'numpy.ndarray\'> + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + <pyarrow.lib.Int64Array object at ...> + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of null + values. Further operations on this pandas Series might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 <NA> + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead.
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + Series + The mutated series. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. 
+ skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, n: int = ...) -> Series: + ''' + Shift values by the given number of places. + + Parameters + ---------- + n + Number of places to shift (may be negative). + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. 
The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. 
+ seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. 
+ If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... 
} + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. 
deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
+def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/dataframe/frame deleted file mode 100644 index 3e293ae..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/dataframe/frame +++ /dev/null @@ -1,302 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase, TextIOWrapper -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions import col as col, lit as lit -from polars.interchange.dataframe import PolarsDataFrame as PolarsDataFrame -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnFormatDict as ColumnFormatDict, ColumnNameOrSelector as ColumnNameOrSelector, ColumnTotalsDefinition as ColumnTotalsDefinition, ColumnWidthsDefinition as ColumnWidthsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Label as Label, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as 
Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SelectorType as SelectorType, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... 
- @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., schema: None | SchemaDict = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes, *, infer_schema_length: int | None = ..., schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ..., ignore_errors: bool = ...) -> Self: ... - def _replace(self, column: str, new_column: Series) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def flags(self) -> dict[str, dict[str, bool]]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... 
- def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, *, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state: list[Series]) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Series]: ... - def __reversed__(self) -> Iterator[Series]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... 
- @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote_char: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote_char: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: ColumnFormatDict | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., header_format: dict[str, Any] | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: ColumnWidthsDefinition | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | SelectorType | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ..., freeze_panes: str | tuple[int, int] | tuple[str, int, int] | tuple[int, int, int, int] | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - @overload - def write_ipc_stream(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, Any] | None = ...) -> None: ... - def write_database(self, table_name: str, connection: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... 
- def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: str | Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: ... - @overload - def glimpse(self, *, max_items_per_column: int = ..., max_colname_length: int = ..., return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, max_items_per_column: int = ..., max_colname_length: int = ..., return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool | None = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., label: Label = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... 
- def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... - def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, other: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: DataFrame) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> DataFrame: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, n: int = ..., *, fill_value: IntoExpr | None = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... 
- @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def approx_n_unique(self) -> DataFrame: ... - def approx_unique(self) -> DataFrame: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | Series | None = ..., *, fraction: float | Series | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... 
- def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def corr(self, **kwargs: Any) -> DataFrame: ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def shift_and_fill(self, fill_value: int | str | float, *, n: int = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
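For context before the replacement file below: the stub removed above declares `P = ParamSpec('P')`, `class DataFrame(Generic[P])`, and `pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T`; that ParamSpec is what lets a type checker validate the extra arguments forwarded to the piped callable. A minimal usage sketch of that signature, assuming polars is installed; `add_constant` is a hypothetical helper written only for illustration, not part of this repository:

import polars as pl

def add_constant(df: pl.DataFrame, value: int) -> pl.DataFrame:
    # Hypothetical helper: append a literal column built from `value`.
    return df.with_columns(pl.lit(value).alias("constant"))

df = pl.DataFrame({"a": [1, 2, 3]})
out = df.pipe(add_constant, value=1)   # P.args / P.kwargs: `value` must be an int
# df.pipe(add_constant, value="x")     # a checker reading the stub above flags this
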
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/dataframe/frame.pyi new file mode 100644 index 0000000..5089d3a --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/dataframe/frame.pyi @@ -0,0 +1,6764 @@ +#: version 0.19.12 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as 
parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. 
+ + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. 
+ rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) 
-> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to ``True`` will raise a ``NotImplementedError``. + allow_copy + Allow memory to be copied to perform the conversion. If set to ``False``, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars dataframe to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... 
+ def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + ``structured`` is set to ``False`` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. 
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. 
Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname(s):str,}`` or ``{selector:str,}`` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in ``dtype_formats``. + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + header_format : dict + A ``{key:value,}`` dictionary of ``xlsxwriter`` format options to apply + to the table header row, such as ``{"bold":True, "font_color":"#702963"}``. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. 
+ + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` or ``{selector:int,}`` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. 
+ freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible ``xlsxwriter`` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... 
) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... 
"style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC record batch data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + If you pass ``partition_cols`` here, the dataset will be written + using ``pyarrow.parquet.write_to_dataset``. + The ``partition_cols`` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. 
+ + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, table_name: str, connection: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. 
+ + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
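A short sketch combining `estimated_size` and `transpose` as described above (the exact byte count depends on the data and is indicative only):

import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})

size_bytes = df.estimated_size()    # int, heap size in bytes
size_kb = df.estimated_size("kb")   # float, same size scaled to kilobytes

# Transpose and keep the original column names as a header column.
tdf = df.transpose(include_header=True)
assert tdf.columns[0] == "column"   # default header column name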
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... ) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. 
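A compact sketch of mixing expression predicates with keyword constraints in `filter`, as described above (illustrative only):

import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]})

# Positional predicates are combined with AND; keyword constraints add
# simple equality filters on top.
out = df.filter(pl.col("bar") < 8, ham="b")
assert out.shape == (1, 3)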
+ + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. 
When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. 
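A short sketch of multi-column sorting and `top_k`, following the descriptions above (illustrative only):

import polars as pl

df = pl.DataFrame({"a": ["x", "y", "x", "y"], "b": [2, 1, 3, 4]})

# Sort by two keys with a per-column sort direction.
df_sorted = df.sort("a", "b", descending=[False, True])

# Rows holding the two largest values of "b", without sorting the whole frame.
best = df.top_k(2, by="b")
assert best["b"].to_list() == [4, 3]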
+ Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. 
+ + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... 
) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. 
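A small sketch chaining `drop_nulls`, `pipe` and `with_row_count` as described above; `add_total` is a made-up helper used only for illustration:

import polars as pl

def add_total(data: pl.DataFrame, left: str, right: str) -> pl.DataFrame:
    # Hypothetical UDF: add a column holding the sum of two existing columns.
    return data.with_columns((pl.col(left) + pl.col(right)).alias("total"))

df = pl.DataFrame({"a": [1, None, 3], "b": [4, 5, 6]})
out = df.drop_nulls().pipe(add_total, left="a", right="b").with_row_count("row_nr")
assert out.columns == ["row_nr", "a", "b", "total"]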
+ + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``group_by_dynamic`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. 
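A minimal sketch of a rolling aggregation over an integer index column, using the "i" (index count) unit described above (illustrative only, not part of the generated stub):

import polars as pl

df = pl.DataFrame({"idx": [1, 2, 3, 4], "a": [10, 20, 30, 40]}).with_columns(
    pl.col("idx").set_sorted()
)

# With period="2i" and the default closed="right", each window is (idx - 2, idx],
# i.e. the current row plus at most one preceding row.
out = df.rolling(index_column="idx", period="2i").agg(pl.sum("a").alias("sum_a"))
assert out["sum_a"].to_list() == [10, 30, 50, 70]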
+ + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. 
warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is ``\'window\'``. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. 
+ + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... 
"time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... 
datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
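+
+        A minimal sketch of that caching pattern (the frame and function names
+        here are illustrative only, not part of the polars API):
+
+        .. code-block:: python
+
+            import polars as pl
+            from functools import lru_cache
+
+            df = pl.DataFrame({"foo": [1, 2, 1], "bar": [-1, 5, -1]})
+
+            @lru_cache(maxsize=None)
+            def expensive(row: tuple) -> int:
+                # stand-in for genuinely costly per-row work
+                return row[0] * 2 + row[1]
+
+            df.map_rows(expensive)  # the duplicated row is only computed once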
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from ``vstack`` which adds the chunks from ``other`` to the chunks of + this ``DataFrame``, ``extend`` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``vstack`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer ``vstack`` over ``extend`` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single ``DataFrame``. In the latter case, finish the sequence of + ``vstack`` operations with a ``rechunk``. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
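+
+        Notes
+        -----
+        A rough sketch of the "append many times, then query" pattern described
+        above, using ``vstack`` followed by a final ``rechunk`` (the frame names
+        are illustrative only):
+
+        .. code-block:: python
+
+            import polars as pl
+
+            frames = [pl.DataFrame({"foo": [i], "bar": [i * 10]}) for i in range(3)]
+            out = frames[0]
+            for frame in frames[1:]:
+                out = out.vstack(frame)
+            out = out.rechunk()  # consolidate chunks before running queries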
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. 
+ + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the ``LAG`` operation in SQL when the value for ``n`` + is positive. With a negative value for ``n``, it is similar to ``LEAD``. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify ``fill_value`` to fill the resulting null values. 
+ + >>> df.shift(-2, fill_value=100) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using ``iter_rows`` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy ``corrcoef``. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... 
).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the values in `other`. + + By default, null values in the right dataframe are ignored. Use + `ignore_nulls=False` to overwrite values in this frame with null values in other + frame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... 
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. 
+ Doing so incorrectly will lead to incorrect output + + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is ``\'window\'``. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
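A number of the docstrings in this DataFrame stub are deprecation shims that only point at renamed methods (``groupby`` -> ``group_by``, ``groupby_rolling``/``group_by_rolling`` -> ``rolling``, ``groupby_dynamic`` -> ``group_by_dynamic``, ``apply`` -> ``map_rows``, ``shift_and_fill`` -> ``shift``). A minimal migration sketch, assuming a Polars version at least as new as these stubs (0.19.12); the frame, column names, and lambda are invented for illustration and are not taken from the stub itself:

import polars as pl

df = pl.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})

# groupby(...)            -> group_by(...)
grouped = df.group_by("key", maintain_order=True).agg(pl.col("val").sum())

# apply(fn)               -> map_rows(fn); builds a new frame from the returned tuples
doubled = df.map_rows(lambda row: (row[0], row[1] * 2))

# shift_and_fill(x, n=1)  -> shift(1, fill_value=x)
shifted = df.shift(1, fill_value=0)

Nothing above depends on the stubs themselves; it is only meant to show which call sites the deprecation notes in the docstrings are steering users toward.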
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/expr/expr deleted file mode 100644 index 88007bd..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/expr/expr +++ /dev/null @@ -1,282 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, MapElementsStrategy as MapElementsStrategy, NullBehavior as NullBehavior, NumericLiteral as NumericLiteral, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, TemporalLiteral as TemporalLiteral, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... 
- def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int | bool) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int | bool) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr | int | bool) -> Self: ... - def __rxor__(self, other: Any) -> Self: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self, *, ignore_nulls: bool = ...) -> Self: ... - def all(self, *, ignore_nulls: bool = ...) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def cbrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def keep_name(self) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def is_not(self) -> Self: ... - def not_(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... 
- def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int | IntoExprColumn = ...) -> Self: ... - def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def get(self, index: int | Expr) -> Self: ... - def shift(self, n: int | IntoExprColumn = ..., *, fill_value: IntoExpr | None = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., check_sorted: bool = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first_distinct(self) -> Self: ... - def is_last_distinct(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: ... - def qcut(self, quantiles: Sequence[float] | int, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> Self: ... - def rle(self) -> Self: ... - def rle_id(self) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... 
- def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | None | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... 
- def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int | IntoExprColumn = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def cot(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | IntoExprColumn | None = ..., *, fraction: float | IntoExprColumn | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... 
- def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def is_first(self) -> Self: ... - def is_last(self) -> Self: ... - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: ... - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, n: int = ...) -> Self: ... - def register_plugin(self, *, lib: str, symbol: str, args: list[IntoExpr] | None = ..., kwargs: dict[Any, Any] | None = ..., is_elementwise: bool = ..., input_wildcard_expansion: bool = ..., returns_scalar: bool = ..., cast_to_supertypes: bool = ...) -> Self: ... - def _register_plugin(self, *, lib: str, symbol: str, args: list[IntoExpr] | None = ..., kwargs: dict[Any, Any] | None = ..., is_elementwise: bool = ..., input_wildcard_expansion: bool = ..., auto_explode: bool = ..., cast_to_supertypes: bool = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def name(self) -> ExprNameNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
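The hand-written stub deleted above is also what carried the ``ParamSpec``-typed ``pipe`` signature (``Callable[Concatenate[Expr, P], T]`` with ``*args: P.args, **kwargs: P.kwargs``), which is what lets a type checker validate the extra arguments forwarded through ``pipe``. A small sketch of the call pattern that signature is meant to cover; ``add_margin`` and its ``rate`` parameter are invented for illustration:

import polars as pl

def add_margin(expr: pl.Expr, rate: float) -> pl.Expr:
    # A plain function taking an Expr first; ``pipe`` forwards the remaining
    # arguments, and the ParamSpec'd signature checks them against this one.
    return expr * (1 + rate)

df = pl.DataFrame({"price": [10.0, 20.0, 30.0]})
out = df.select(pl.col("price").pipe(add_margin, rate=0.25).alias("gross"))

# Under the deleted stub, forwarding e.g. rate="25%" through ``pipe`` should be
# rejected by mypy, since the extra arguments are checked against
# ``add_margin``'s own parameter types.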
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/expr/expr.pyi new file mode 100644 index 0000000..193937d --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/expr/expr.pyi @@ -0,0 +1,8204 @@ +#: version 0.19.12 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... 
+ def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + .. 
note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. 
+ + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... 
) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. 
+ + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Return the number of elements in the column. + + .. warning:: + Null values are treated like regular elements in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values are treated like regular elements in this context. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... 
) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... 
) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... 
) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... 
) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... 
) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").take([2, 1])) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + + See Also + -------- + Expr.get : Take a single value + + ''' + def get(self, index: int | Expr) -> Self: + ''' + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the ``LAG`` operation in SQL when the value for ``n`` + is positive. With a negative value for ``n``, it is similar to ``LEAD``. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify ``fill_value`` to fill the resulting null values. + + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. 
+ + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... 
"fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... 
+ * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... ) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. 
+ + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. 
+ include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... 
).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. 
If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for ``map`` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Notes + ----- + If you are looking to map a function over a window function or group_by context, + refer to func:`map_elements` instead. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + map_elements + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type ``Callable[[Any], Any]``. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type ``Callable[[Series], Any]``. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be ``pl.Unknown``. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). + pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using ``map_elements`` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using ``over`` is considered a GroupBy context + here, so ``map_elements`` can be used to map functions over window groups. 
+ + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using ``over`` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... ).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. 
+ Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... 
) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other`` where ``None == None``. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr != other`` where ``None == None``. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... 
) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. 
+ + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... 
) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. 
Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... 
).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 
2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". 
+ + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... 
window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. 
This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. 
warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". 
+ + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. 
+ + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. 
math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: + ''' + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. 
+ If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. 
+ + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. 
+ pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by ``lib::symbol`` + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. 
+ returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like ``sum``, ``min``, ``covariance`` etc. + + cast_to_supertypes + Cast the input datatypes to their supertype. + + """ + def _register_plugin(self) -> Self: ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/lazyframe/frame deleted file mode 100644 index 16a944f..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/lazyframe/frame +++ /dev/null @@ -1,161 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat, numpy as np, subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, ColumnNameOrSelector as ColumnNameOrSelector, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Label as Label, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult -from 
polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath -from typing import Any, Awaitable, Callable, ClassVar, Collection, Concatenate, Iterable, Literal, Mapping, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - @classmethod - def _scan_csv(cls, source: str | list[str] | list[Path], *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., schema: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str | list[str] | list[Path], *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ..., hive_partitioning: bool = ..., retries: int = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path | list[str] | list[Path], *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str | Path | list[str] | list[Path], *, infer_schema_length: int | None = ..., schema: SchemaDefinition | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... 
- @classmethod - def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any, *, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... - def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def serialize(self, file: None = ...) -> str: ... - @overload - def serialize(self, file: IOBase | str | Path) -> None: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | Path | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... 
- def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., no_optimization: bool = ..., streaming: bool = ..., _eager: bool = ...) -> DataFrame: ... - @overload - def collect_async(self, *, gevent: Literal[True], type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> _GeventDataFrameResult[DataFrame]: ... - @overload - def collect_async(self, *, gevent: Literal[False] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> Awaitable[DataFrame]: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., no_optimization: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., no_optimization: bool = ...) -> DataFrame: ... - def sink_csv(self, path: str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote_char: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., no_optimization: bool = ...) -> DataFrame: ... - def _set_sink_optimizations(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., no_optimization: bool = ...) -> PyLazyFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... 
- def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool | None = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., label: Label = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, n: int | IntoExprColumn = ..., *, fill_value: IntoExpr | None = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: ... 
- def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: ... - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map_batches(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, n: int = ...) -> Self: ... 
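Both the stub deleted above and the regenerated frame.pyi added below type LazyFrame.pipe as Callable[Concatenate[LazyFrame, P], T] with *args: P.args and **kwargs: P.kwargs (P being the ParamSpec declared at the top of the old stub). A minimal sketch of what that ParamSpec/Concatenate pattern buys a caller is given here; the helper name typed_pipe and the tiny demo frame are hypothetical, used only to illustrate the typing, and are not part of the generated stubs.

# Illustrative sketch only, assuming a polars installation; mirrors the stubbed
# `pipe` signature so a type checker can validate *args/**kwargs against the
# callable that receives the frame.
from __future__ import annotations

from typing import Callable, TypeVar

from typing_extensions import Concatenate, ParamSpec

import polars as pl

P = ParamSpec("P")
T = TypeVar("T")


def typed_pipe(
    lf: pl.LazyFrame,
    function: Callable[Concatenate[pl.LazyFrame, P], T],
    *args: P.args,
    **kwargs: P.kwargs,
) -> T:
    # The checker binds P to `function`'s parameters after the frame itself,
    # so extra or misspelled keyword arguments are flagged statically.
    return function(lf, *args, **kwargs)


def cast_to_int(lf: pl.LazyFrame, col_name: str) -> pl.LazyFrame:
    return lf.with_columns(pl.col(col_name).cast(pl.Int64))


if __name__ == "__main__":
    lf = pl.LazyFrame({"a": [1, 2], "b": ["10", "20"]})
    print(typed_pipe(lf, cast_to_int, col_name="b").collect())

With this signature a call like typed_pipe(lf, cast_to_int, column="b") is rejected by a type checker, because P is bound to cast_to_int's parameters; that is the behaviour the Generic[P]/ParamSpec patch in these stubs preserves for LazyFrame.pipe.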
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..6cc7053 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/lazyframe/frame.pyi @@ -0,0 +1,4162 @@ +#: version 0.19.12 +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to ``StringIO`` + and then use ``LazyFrame.deserialize``. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to ``deserialize``. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. 
+ + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``True``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... 
"b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. 
+ + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to ``False``. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to ``False`` (default), the entire query is processed in a single + batch. + + .. warning:: + This functionality is currently in an alpha state. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... 
) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + dataframe directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. 
+ Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. 
+ float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that ``fetch`` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if ``n_rows`` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... 
) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. 
+ + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... ).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... 
).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_group_by`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... 
pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... ) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... 
+ * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. 
+ allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. 
+ + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. 
+ + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... 
) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the ``LAG`` operation in SQL when the value for ``n`` + is positive. With a negative value for ``n``, it is similar to ``LEAD``. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify ``fill_value`` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. 
The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... 
"b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. 
+ By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... 
) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. 
+ + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on; if given ``None`` the implicit row + index is used as a join key instead. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. 
+ * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is ``\'window\'``. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. 
This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/series/series deleted file mode 100644 index a075e61..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/series/series +++ /dev/null @@ -1,367 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, NullBehavior as NullBehavior, NumericLiteral as NumericLiteral, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TemporalLiteral as TemporalLiteral -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.deprecation 
import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, Literal, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> tuple[int, int, int]: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Series: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Series: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Series: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Series: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Series: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Series: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Series: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... 
- def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - @overload - def __floordiv__(self, other: Expr) -> Expr: ... - @overload - def __floordiv__(self, other: Any) -> Series: ... - def __invert__(self) -> Series: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def __column_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _repr_html_(self) -> str: ... - def item(self, index: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def cbrt(self) -> Series: ... - @overload - def any(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def any(self, *, ignore_nulls: bool) -> bool | None: ... - @overload - def all(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def all(self, *, ignore_nulls: bool) -> bool | None: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... 
- def product(self) -> int | float: ... - def pow(self, exponent: int | float | None | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: bool) -> Series | DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: bool) -> Series | DataFrame: ... - def rle(self) -> Series: ... - def rle_id(self) -> Series: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool | None = ...) -> Self: ... - def extend(self, other: Series) -> Self: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) 
-> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int | IntoExprColumn = ...) -> Series: ... - def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def not_(self) -> Series: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first_distinct(self) -> Series: ... - def is_last_distinct(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool | None = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... 
- def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def cot(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, n: int = ..., *, fill_value: IntoExpr | None = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int | IntoExprColumn = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... 
- def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def is_first(self) -> Series: ... - def is_last(self) -> Series: ... - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: ... - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, n: int = ...) -> Series: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/series/series.pyi new file mode 100644 index 0000000..cd201da --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.12/polars/series/series.pyi @@ -0,0 +1,4801 @@ +#: version 0.19.12 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import ShapeError as ShapeError +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: 
bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the ``Series`` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series == other`` where ``None == None``. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series != other`` where ``None == None``. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With an index, this is equivalent to ``s[index]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) 
-> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. 
+ + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. 
+ + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + category_label + Name of the category column. Only used if ``include_breaks`` is set to + ``True``. + + .. 
deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + category_label + Name of the category column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. 
+ + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. + + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. 
+ + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴────────┘ + + Sort the output by count. + + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and ``append`` will change to always + behave like ``append_chunks=True`` (the previous default). For the + behavior of ``append_chunks=False``, use ``Series.extend``. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). 
+ + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. + + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from ``append``, which adds the chunks from ``other`` to the chunks of + this series, ``extend`` appends the data from ``other`` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``append`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer ``append`` over ``extend`` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single ``Series``. In the latter case, finish the sequence + of ``append`` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. 
If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. 
+ side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no ``null`` values. + + Notes + ----- + While the *absence* of a validity bitmask guarantees that a Series does not + have ``null`` values, the converse is not true, eg: the *presence* of a + bitmask does not mean that there are null values, as every value of the + bitmask could be ``false``. + + To confirm that a column has ``null`` values use :func:`null_count`. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def not_(self) -> Series: + ''' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Return the number of elements in this Series. + + Null values are treated like regular elements in this context. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. 
Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point ``nan`` values can be zero-copied; + - booleans can\'t be zero-copied. 
+ + To ensure that no data is cloned, set ``zero_copy_only=True``. + + Alternatively, if you want a zero-copy view and know what you are doing, + use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + Series + The mutated series. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. 
+ skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the ``LAG`` operation in SQL when the value for ``n`` + is positive. With a negative value for ``n``, it is similar to ``LEAD``. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. + + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify ``fill_value`` to fill the resulting null values. + + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. 
+ + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. 
If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. 
+ - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. 
+ + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to ``None`` (default), no lower bound is applied. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to ``None`` (default), no upper bound is applied. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> s = pl.Series([-50, 5, 50, None]) + >>> s.clip(1, 10) + shape: (4,) + Series: '' [i64] + [ + 1 + 5 + 10 + null + ] + + Specifying only a single bound: + + >>> s.clip(upper_bound=10) + shape: (4,) + Series: '' [i64] + [ + -50 + 5 + 10 + null + ] + + """ + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... 
+ + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. 
deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
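The deprecated methods stubbed above each name a direct replacement (`map_elements`, `rolling_map`, `is_first_distinct`/`is_last_distinct`, `clip`, `shift`). A minimal sketch of the non-deprecated spellings, assuming a polars version matching these stubs; the sample Series and bounds are illustrative, and the argument names follow the docstrings above:

import polars as pl

s = pl.Series("a", [1, 2, 3, 4])

# apply -> map_elements (still a Python UDF; native expressions remain faster)
s.map_elements(lambda x: x + 10)

# rolling_apply -> rolling_map
s.rolling_map(sum, window_size=3)

# is_first / is_last -> is_first_distinct / is_last_distinct
s.is_first_distinct()
s.is_last_distinct()

# clip_min / clip_max -> clip with a single bound
s.clip(lower_bound=0)
s.clip(upper_bound=3)

# shift_and_fill -> shift with fill_value
s.shift(-2, fill_value=100)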
+def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/dataframe/frame deleted file mode 100644 index d8493b8..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/dataframe/frame +++ /dev/null @@ -1,302 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase, TextIOWrapper -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions import col as col, lit as lit -from polars.interchange.dataframe import PolarsDataFrame as PolarsDataFrame -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnFormatDict as ColumnFormatDict, ColumnNameOrSelector as ColumnNameOrSelector, ColumnTotalsDefinition as ColumnTotalsDefinition, ColumnWidthsDefinition as ColumnWidthsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Label as Label, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as 
Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SelectorType as SelectorType, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... 
- @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., schema: None | SchemaDict = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes, *, infer_schema_length: int | None = ..., schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ..., ignore_errors: bool = ...) -> Self: ... - def _replace(self, column: str, new_column: Series) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def flags(self) -> dict[str, dict[str, bool]]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... 
- def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, *, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state: list[Series]) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Series]: ... - def __reversed__(self) -> Iterator[Series]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... 
- @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, include_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote_char: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, include_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote_char: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: ... - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: ColumnFormatDict | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., header_format: dict[str, Any] | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: ColumnWidthsDefinition | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., include_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | SelectorType | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ..., freeze_panes: str | tuple[int, int] | tuple[str, int, int] | tuple[int, int, int, int] | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - @overload - def write_ipc_stream(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, Any] | None = ...) -> None: ... - def write_database(self, table_name: str, connection: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... 
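The `file=None` overloads above are how a checker distinguishes "serialize to a string" from "write to a path or buffer". A short sketch of the distinction, assuming polars is installed; the output path is illustrative:

import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3]})
csv_text = df.write_csv()   # file defaults to None -> inferred as str
df.write_csv("out.csv")     # writing to a path     -> inferred as None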
- def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: str | Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: ... - @overload - def glimpse(self, *, max_items_per_column: int = ..., max_colname_length: int = ..., return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, max_items_per_column: int = ..., max_colname_length: int = ..., return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool | None = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., label: Label = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... 
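The `pipe` signature above uses `Concatenate[DataFrame, P]`, so the wrapped function's own parameters and return type flow through the call. A minimal sketch, assuming polars is installed:

import polars as pl

def with_total(df: pl.DataFrame, col: str) -> pl.DataFrame:
    # Add a broadcast column holding the sum of `col`.
    return df.with_columns(pl.col(col).sum().alias("total"))

out = pl.DataFrame({"x": [1, 2, 3]}).pipe(with_total, "x")  # inferred as DataFrame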
- def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... - def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, other: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: DataFrame) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> DataFrame: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, n: int = ..., *, fill_value: IntoExpr | None = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... 
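The `Literal` overloads on `n_chunks` above give the two strategies distinct return types. Illustratively, assuming polars is installed:

import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
total = df.n_chunks()            # strategy defaults to 'first' -> inferred as int
per_column = df.n_chunks("all")  # strategy 'all'               -> inferred as list[int]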
- @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def approx_n_unique(self) -> DataFrame: ... - def approx_unique(self) -> DataFrame: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | Series | None = ..., *, fraction: float | Series | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... 
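Likewise, the `named` overloads on `row`, `rows`, and `iter_rows` above keep tuple results and dict results distinguishable to a type checker. A short sketch, assuming polars is installed:

import polars as pl

df = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})
as_tuples = df.rows()           # inferred as list[tuple[Any, ...]]
as_dicts = df.rows(named=True)  # inferred as list[dict[str, Any]]
first = df.row(0, named=True)   # inferred as dict[str, Any]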
- def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def corr(self, **kwargs: Any) -> DataFrame: ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def shift_and_fill(self, fill_value: int | str | float, *, n: int = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
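The replacement stub added below carries the polars release it was generated from as a leading `#: version ...` comment, so a vendored stub can be matched against an installed polars version. A minimal sketch of reading that header; the path shown is illustrative only, not an API of this package:

from pathlib import Path

def stub_polars_version(stub_path: Path) -> str | None:
    # Generated stubs start with a line such as "#: version 0.19.13".
    first_line = stub_path.read_text().splitlines()[0]
    prefix = "#: version "
    return first_line[len(prefix):] if first_line.startswith(prefix) else None

# Example (hypothetical local path):
# stub_polars_version(Path("src/polugins_type_gen/_stubs/0.19.13/polars/dataframe/frame.pyi"))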
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/dataframe/frame.pyi new file mode 100644 index 0000000..e263569 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/dataframe/frame.pyi @@ -0,0 +1,6758 @@ +#: version 0.19.13 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, 
parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) 
-> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
+ schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use `pl.read_csv` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use `pl.read_parquet` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading `n_rows`. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use `pl.read_json` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use `pl.read_ndjson` to dispatch to this method. 
+ + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with `NaN`. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to `True` will raise a `NotImplementedError`. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars dataframe to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... 
+ def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to `df[0,0]`, with a check that + the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + `structured` is set to `False` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + `pyarrow`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. 
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. 
+ + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) 
-> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path or writeable file-like object to which the data will be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + name + Schema name. Defaults to empty string. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open `xlsxwriter.Workbook` object that has not been closed. + If None, writes to a `dataframe.xlsx` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of `{"key":value,}` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. + column_formats : dict + A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. + dtype_formats : dict + A `{dtype:str,}` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + `column_formats` param). It is also valid to use dtype groups such as + `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid `xlsxwriter` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all `xlsxwriter` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + header_format : dict + A `{key:value,}` dictionary of `xlsxwriter` format options to apply + to the table header row, such as `{"bold":True, "font_color":"#702963"}`. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". 
+ * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a `{colname:funcname,}` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A `{colname:int,}` or `{selector:int,}` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a `{colname:columns,}` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or `{row_index:int,}` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that `row_index` starts at zero and will be + the header row (unless `include_header` is False). + sparklines : dict + A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an `xlsxwriter`-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + include_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. 
+ hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible `xlsxwriter` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... 
) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... 
# create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC data will be + written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC record batch data will + be written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. 
+ compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to `pyarrow.parquet.write_table`. + + If you pass `partition_cols` here, the dataset will be written + using `pyarrow.parquet.write_to_dataset`. + The `partition_cols` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, table_name: str, connection: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. 
+ ArrowInvalidError
+ If the DataFrame contains data types that could not be cast to their
+ primitive type.
+
+ Notes
+ -----
+ The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time`
+ are not supported by the delta protocol specification and will raise a
+ TypeError.
+
+ Some other data types are not supported but have an associated `primitive type
+ `__
+ to which they can be cast. This affects the following data types:
+
+ - Unsigned integers
+ - :class:`Datetime` types with millisecond or nanosecond precision or with
+ time zone information
+ - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types)
+
+ Polars columns are always nullable. To write data to a delta table with
+ non-nullable columns, a custom pyarrow schema has to be passed to the
+ `delta_write_options`. See the last example below.
+
+ Examples
+ --------
+ Write a dataframe to the local filesystem as a Delta Lake table.
+
+ >>> df = pl.DataFrame(
+ ... {
+ ... "foo": [1, 2, 3, 4, 5],
+ ... "bar": [6, 7, 8, 9, 10],
+ ... "ham": ["a", "b", "c", "d", "e"],
+ ... }
+ ... )
+ >>> table_path = "/path/to/delta-table/"
+ >>> df.write_delta(table_path) # doctest: +SKIP
+
+ Append data to an existing Delta Lake table on the local filesystem.
+ Note that this will fail if the schema of the new data does not match the
+ schema of the existing table.
+
+ >>> df.write_delta(table_path, mode="append") # doctest: +SKIP
+
+ Overwrite a Delta Lake table as a new version.
+ If the schemas of the new and old data are the same, setting
+ `overwrite_schema` is not required.
+
+ >>> existing_table_path = "/path/to/delta-table/"
+ >>> df.write_delta(
+ ... existing_table_path, mode="overwrite", overwrite_schema=True
+ ... ) # doctest: +SKIP
+
+ Write a dataframe as a Delta Lake table to a cloud object store like S3.
+
+ >>> table_path = "s3://bucket/prefix/to/delta-table/"
+ >>> df.write_delta(
+ ... table_path,
+ ... storage_options={
+ ... "AWS_REGION": "THE_AWS_REGION",
+ ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID",
+ ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY",
+ ... },
+ ... ) # doctest: +SKIP
+
+ Write DataFrame as a Delta Lake table with non-nullable columns.
+
+ >>> import pyarrow as pa
+ >>> existing_table_path = "/path/to/delta-table/"
+ >>> df.write_delta(
+ ... existing_table_path,
+ ... delta_write_options={
+ ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)])
+ ... },
+ ... ) # doctest: +SKIP
+
+ \'\'\'
+ def estimated_size(self, unit: SizeUnit = ...) -> int | float:
+ \'\'\'
+ Return an estimation of the total (heap) allocated size of the `DataFrame`.
+
+ Estimated size is given in the specified unit (bytes by default).
+
+ This estimation is the sum of the size of its buffers, validity, including
+ nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
+ size of 2 arrays is not the sum of the sizes computed from this function. In
+ particular, [`StructArray`]\'s size is an upper bound.
+
+ When an array is sliced, its allocated size remains constant because the buffer
+ is unchanged. However, this function will yield a smaller number. This is because
+ this function returns the visible size of the buffer, not its total capacity.
+
+ FFI buffers are included in this estimation.
+
+ Parameters
+ ----------
+ unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'}
+ Scale the returned size to the given unit.
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "x": list(reversed(range(1_000_000))),
+ ... 
"y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... 
) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... 
) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... 
) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. 
+ + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. 
For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The `GroupBy` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use
+ :func:`DataFrame.group_by_dynamic`.
+
+ If you have a time series ``<t_0, t_1, ..., t_n>``, then by default the
+ windows created will be
+
+ * (t_0 - period, t_0]
+ * (t_1 - period, t_1]
+ * ...
+ * (t_n - period, t_n]
+
+ whereas if you pass a non-default `offset`, then the windows will be
+
+ * (t_0 + offset, t_0 + offset + period]
+ * (t_1 + offset, t_1 + offset + period]
+ * ...
+ * (t_n + offset, t_n + offset + period]
+
+ The `period` and `offset` arguments are created either from a timedelta, or
+ by using the following string language:
+
+ - 1ns (1 nanosecond)
+ - 1us (1 microsecond)
+ - 1ms (1 millisecond)
+ - 1s (1 second)
+ - 1m (1 minute)
+ - 1h (1 hour)
+ - 1d (1 calendar day)
+ - 1w (1 calendar week)
+ - 1mo (1 calendar month)
+ - 1q (1 calendar quarter)
+ - 1y (1 calendar year)
+ - 1i (1 index count)
+
+ Or combine them:
+ "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+
+ By "calendar day", we mean the corresponding time on the next day (which may
+ not be 24 hours, due to daylight savings). Similarly for "calendar week",
+ "calendar month", "calendar quarter", and "calendar year".
+
+ In case of a rolling operation on an integer column, the windows are defined by:
+
+ - **"1i" # length 1**
+ - **"10i" # length 10**
+
+ Parameters
+ ----------
+ index_column
+ Column used to group based on the time window.
+ Often of type Date/Datetime.
+ This column must be sorted in ascending order (or, if `by` is specified,
+ then it must be sorted in ascending order within each group).
+
+ In case of a rolling operation on indices, dtype needs to be one of
+ {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if
+ performance matters use an Int64 column.
+ period
+ length of the window - must be non-negative
+ offset
+ offset of the window. Default is -period
+ closed : {\'right\', \'left\', \'both\', \'none\'}
+ Define which sides of the temporal interval are closed (inclusive).
+ by
+ Also group by this column/these columns
+ check_sorted
+ When the `by` argument is given, polars can not check sortedness
+ by the metadata and has to do a full scan on the index column to
+ verify data is sorted. This is expensive. If you are sure the
+ data within the by groups is sorted, you can set this to `False`.
+ Doing so incorrectly will lead to incorrect output
+
+ Returns
+ -------
+ RollingGroupBy
+ Object you can call `.agg` on to aggregate by groups, the result
+ of which will be sorted by `index_column` (but note that if `by` columns are
+ passed, it will only be sorted within each `by` group).
+
+ See Also
+ --------
+ group_by_dynamic
+
+ Examples
+ --------
+ >>> dates = [
+ ... "2020-01-01 13:45:48",
+ ... "2020-01-01 16:42:13",
+ ... "2020-01-01 16:45:09",
+ ... "2020-01-02 18:12:48",
+ ... "2020-01-03 19:45:32",
+ ... "2020-01-08 23:16:43",
+ ... ]
+ >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns(
+ ... pl.col("dt").str.strptime(pl.Datetime).set_sorted()
+ ... )
+ >>> out = df.rolling(index_column="dt", period="2d").agg(
+ ... [
+ ... pl.sum("a").alias("sum_a"),
+ ... pl.min("a").alias("min_a"),
+ ... pl.max("a").alias("max_a"),
+ ... ]
+ ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. 
+ * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... 
"values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see `pl.StringCache()`. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: `udf(row)`. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level `apply` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level `apply` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of + this `DataFrame`, `extend` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer `vstack` over `extend` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single `DataFrame`. In the latter case, finish the sequence of + `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill `value`. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to `None` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. 
+ + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying `as_dict=True`. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. 
+ + >>> df.shift(-2, fill_value=100) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to `None` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the `DataFrame` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + - Int8 + Utf8 = Utf8 + - Float32 + Int64 = Float32 + - Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The `index` and `by_predicate` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using `by_predicate` it is an error condition if anything other than + one row is returned; more than one row raises `TooManyRowsReturnedError`, and + zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of `iter_rows()` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify `named=True` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use `by_predicate` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using `iter_rows` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a `DataFrame` to a `Series` of type `Struct`. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy `corrcoef` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy `corrcoef`. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... 
).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the values in `other`. + + By default, null values in the right dataframe are ignored. Use + `ignore_nulls=False` to overwrite values in this frame with null values in other + frame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... 
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. 
+ Doing so incorrectly will lead to incorrect output + + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
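The `rows`, `rows_by_key`, and `iter_rows` docstrings in the stub above all warn that exporting row-oriented, python-native data is expensive. A minimal sketch of that trade-off, using made-up data and assuming only that polars is installed (illustrative only, not part of the generated stub):

import polars as pl

df = pl.DataFrame({"w": ["a", "b", "b"], "y": [1.0, 2.5, 3.0]})

# iter_rows yields python-native rows from an internal buffer, so the whole
# frame is never materialised as Python objects at once.
total = sum(row["y"] for row in df.iter_rows(named=True))

# rows_by_key materialises a key-indexed dictionary; per the docstring above,
# reach for it only when the data must leave polars entirely.
by_w = df.rows_by_key(key="w", named=True)

print(total, dict(by_w))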
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/expr/expr deleted file mode 100644 index 74994b2..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/expr/expr +++ /dev/null @@ -1,283 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, MapElementsStrategy as MapElementsStrategy, NullBehavior as NullBehavior, NumericLiteral as NumericLiteral, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, TemporalLiteral as TemporalLiteral, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... 
- def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int | bool) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int | bool) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr | int | bool) -> Self: ... - def __rxor__(self, other: Any) -> Self: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self, *, ignore_nulls: bool = ...) -> Self: ... - def all(self, *, ignore_nulls: bool = ...) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def cbrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def keep_name(self) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def is_not(self) -> Self: ... - def not_(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def round_sig_figs(self, digits: int) -> Self: ... 
- def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int | IntoExprColumn = ...) -> Self: ... - def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def get(self, index: int | Expr) -> Self: ... - def shift(self, n: int | IntoExprColumn = ..., *, fill_value: IntoExpr | None = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., check_sorted: bool = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first_distinct(self) -> Self: ... - def is_last_distinct(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: ... - def qcut(self, quantiles: Sequence[float] | int, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> Self: ... - def rle(self) -> Self: ... - def rle_id(self) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... 
- def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | None | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... 
- def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int | IntoExprColumn = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def cot(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | IntoExprColumn | None = ..., *, fraction: float | IntoExprColumn | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... 
- def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def is_first(self) -> Self: ... - def is_last(self) -> Self: ... - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: ... - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, n: int = ...) -> Self: ... - def register_plugin(self, *, lib: str, symbol: str, args: list[IntoExpr] | None = ..., kwargs: dict[Any, Any] | None = ..., is_elementwise: bool = ..., input_wildcard_expansion: bool = ..., returns_scalar: bool = ..., cast_to_supertypes: bool = ..., pass_name_to_apply: bool = ..., changes_length: bool = ...) -> Self: ... - def _register_plugin(self, *, lib: str, symbol: str, args: list[IntoExpr] | None = ..., kwargs: dict[Any, Any] | None = ..., is_elementwise: bool = ..., input_wildcard_expansion: bool = ..., auto_explode: bool = ..., cast_to_supertypes: bool = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def name(self) -> ExprNameNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
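The extension-less stub deleted above is replaced just below by an `expr.pyi` whose first line is the comment `#: version 0.19.13`. A minimal sketch of how such a header could be compared against the installed polars version; `stub_version` and `stub_matches_installed` are hypothetical helpers (not part of this change), and the `packaging` library is assumed to be available:

from importlib.metadata import version as installed_version
from pathlib import Path

from packaging.version import Version


def stub_version(stub_path: Path) -> Version:
    # The generated .pyi files start with a line like "#: version 0.19.13".
    header = stub_path.read_text().splitlines()[0]
    return Version(header.removeprefix("#: version").strip())


def stub_matches_installed(stub_path: Path) -> bool:
    # Compare the stub's declared version with the polars version installed
    # in the current environment.
    return stub_version(stub_path) == Version(installed_version("polars"))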
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/expr/expr.pyi new file mode 100644 index 0000000..b07f131 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/expr/expr.pyi @@ -0,0 +1,8217 @@ +#: version 0.19.13 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... 
+ def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + .. 
note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.map`. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + keep_name + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.prefix`. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.suffix`. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.keep`. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with `^` and end with `$`. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... 
"""Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Return the number of elements in the column. + + .. warning:: + Null values are treated like regular elements in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values are treated like regular elements in this context. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. 
If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... 
) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... 
) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def round_sig_figs(self, digits: int) -> Self: + ''' + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) + >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) + shape: (3, 2) + ┌─────────┬────────────────┐ + │ a ┆ round_sig_figs │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════════╪════════════════╡ + │ 0.01234 ┆ 0.012 │ + │ 3.333 ┆ 3.3 │ + │ 1234.0 ┆ 1200.0 │ + └─────────┴────────────────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. 
+ When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... 
) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").take([2, 1])) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + + See Also + -------- + Expr.get : Take a single value + + ''' + def get(self, index: int | Expr) -> Self: + ''' + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify `fill_value` to fill the resulting null values. 
+ + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... 
) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... 
) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), + ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), + ... 
).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for `map` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Notes + ----- + If you are looking to map a function over a window function or group_by context, + refer to func:`map_elements` instead. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + map_elements + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type `Callable[[Any], Any]`. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type `Callable[[Series], Any]`. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be `pl.Unknown`. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). 
+ pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using `map_elements` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using `over` is considered a GroupBy context + here, so `map_elements` can be used to map functions over window groups. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using `over` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... 
).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. 
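+ 
+ Since `limit` is only an alias, the two spellings below are interchangeable
+ (illustrative sketch):
+ 
+ >>> pl.col("foo").limit(3)  # doctest: +SKIP
+ >>> pl.col("foo").head(3)  # doctest: +SKIP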
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator `expr & other & ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator `expr | other | ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other` where `None == None`. + + This differs from default `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator `expr >= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator `expr > other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator `expr <= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator `expr < other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator `expr != other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... 
"y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr != other` where `None == None`. + + This differs from default `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator `expr + other`. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator `expr // other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator `expr % other`. + + Parameters + ---------- + other + Numeric literal or expression value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator `expr * other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator `expr - other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator `expr / other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator `expr ** exponent`. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator `expr ^ other`. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. 
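+ 
+ The method form mirrors the `^` operator, so the following are interchangeable
+ (illustrative sketch):
+ 
+ >>> pl.col("x").xor(pl.col("y"))  # doctest: +SKIP
+ >>> pl.col("x") ^ pl.col("y")  # doctest: +SKIP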
+ + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select(pl.col("optional_members").is_in("sets").alias("contains")) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. 
Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with `lit` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) 
-> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". 
+ + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. 
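+ 
+ As a rough sketch of the fixed-size case: with the default `min_periods`, a
+ window of size 2 simply adds each value to its predecessor, so the two
+ expressions below agree (assumes the numeric column "A" from the examples).
+ 
+ >>> pl.col("A").rolling_sum(window_size=2)  # doctest: +SKIP
+ >>> pl.col("A") + pl.col("A").shift(1)  # doctest: +SKIP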
+ + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. 
+ weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. 
+ + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. 
math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: + ''' + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. 
+ If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. 
+ + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type `pl.Categorical`. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of `pl.first()`: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + `polars.Unknown`. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. 
+ pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by `lib::symbol`. + + The parameters you give dictate how polars will deal + with the function. Make sure they are correct! + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. 
+ is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. + returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like `sum`, `min`, `covariance` etc. + cast_to_supertypes + Cast the input datatypes to their supertype. + pass_name_to_apply + if set, then the `Series` passed to the function in the group_by operation + will ensure the name is set. This is an extra heap allocation per group. + changes_length + For example a `unique` or a `slice` + + """ + def _register_plugin(self) -> Self: ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/lazyframe/frame deleted file mode 100644 index 7357e8c..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/lazyframe/frame +++ /dev/null @@ -1,161 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat, numpy as np, subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, ColumnNameOrSelector as ColumnNameOrSelector, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Label as Label, Orientation as Orientation, 
ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath -from typing import Any, Awaitable, Callable, ClassVar, Collection, Concatenate, Iterable, Literal, Mapping, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - @classmethod - def _scan_csv(cls, source: str | list[str] | list[Path], *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., schema: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str | list[str] | list[Path], *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ..., hive_partitioning: bool = ..., retries: int = ...) -> Self: ... 
- @classmethod - def _scan_ipc(cls, source: str | Path | list[str] | list[Path], *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str | Path | list[str] | list[Path], *, infer_schema_length: int | None = ..., schema: SchemaDefinition | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any, *, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... - def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def serialize(self, file: None = ...) -> str: ... - @overload - def serialize(self, file: IOBase | str | Path) -> None: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | Path | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... 
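# Editorial note (not part of the diff): the stub removed in this hunk types
# `pipe` with a ParamSpec (`Callable[Concatenate[LazyFrame, P], T]`), so a type
# checker can validate the extra arguments forwarded to the pipeline function
# against that function's own signature. A minimal sketch of what this enables;
# `add_shifted` and its arguments are illustrative names, not part of this change.
import polars as pl

def add_shifted(lf: pl.LazyFrame, column: str, n: int) -> pl.LazyFrame:
    # Add a copy of `column` shifted by `n` rows.
    return lf.with_columns(pl.col(column).shift(n).alias(f"{column}_shifted"))

lf = pl.LazyFrame({"a": [1, 2, 3]})
# `column` and `n` are forwarded by `pipe` and checked against `add_shifted`.
out = lf.pipe(add_shifted, column="a", n=1).collect()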
- def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., no_optimization: bool = ..., streaming: bool = ..., _eager: bool = ...) -> DataFrame: ... - @overload - def collect_async(self, *, gevent: Literal[True], type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> _GeventDataFrameResult[DataFrame]: ... - @overload - def collect_async(self, *, gevent: Literal[False] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> Awaitable[DataFrame]: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., no_optimization: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., no_optimization: bool = ...) -> DataFrame: ... - def sink_csv(self, path: str | Path, *, include_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote_char: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., no_optimization: bool = ...) -> DataFrame: ... - def _set_sink_optimizations(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., no_optimization: bool = ...) -> PyLazyFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... 
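# Editorial note (not part of the diff): the `sink_*` methods whose old stubs are
# removed above evaluate the query in streaming mode and write directly to disk.
# A hedged usage sketch; the file paths and column names are placeholders only.
import polars as pl

query = (
    pl.scan_csv("my_larger_than_ram_file.csv")  # placeholder path
    .filter(pl.col("a") > 0)
    .select("a", "b")
)
# Streams the result to Parquet without materializing the full frame in memory.
query.sink_parquet("out.parquet", compression="zstd", row_group_size=100_000)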
- def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool | None = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., label: Label = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, n: int | IntoExprColumn = ..., *, fill_value: IntoExpr | None = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... 
- def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: ... - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map_batches(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, n: int = ...) -> Self: ... 
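# Editorial note (not part of the diff): the replacement stub added below begins
# with a "#: version 0.19.13" marker on its first line. A minimal sketch, under
# the assumption that every generated stub carries such a header, of reading that
# marker back out; `read_stub_version` is an illustrative helper, not part of this PR.
from pathlib import Path

def read_stub_version(stub_path: Path) -> str:
    # The first line of a generated stub looks like "#: version 0.19.13".
    first_line = stub_path.read_text().splitlines()[0]
    return first_line.removeprefix("#: version").strip()

# Hypothetical usage:
# read_stub_version(Path("src/polugins_type_gen/_stubs/0.19.13/polars/lazyframe/frame.pyi"))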
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..a58f816 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/lazyframe/frame.pyi @@ -0,0 +1,4157 @@ +#: version 0.19.13 +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use `pl.scan_csv` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use `pl.scan_parquet` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use `pl.scan_ipc` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use `pl.scan_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to `StringIO` + and then use `LazyFrame.deserialize`. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to `deserialize`. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. 
+ + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to `True`. + If this is set to `True` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... 
"b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.with_columns(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. 
+ + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to `False`. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + This functionality is currently in an alpha state. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... 
) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + dataframe directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. 
+ Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. 
+ float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... 
) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. 
+ + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... ).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to `True` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... 
).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `dynamic_group_by` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... 
pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... ) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... 
+ * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. 
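+
+ A minimal sketch of the `validate` parameter described above (using the `lf`
+ and `other_lf` frames from the examples below); with `validate="1:1"` the
+ join raises an error if the join key is duplicated on either side:
+
+ >>> lf.join(other_lf, on="ham", how="inner", validate="1:1").collect()  # doctest: +SKIP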
+ + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... 
(pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. 
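+
+ As the signature indicates, more than one external frame can be registered at
+ once by passing a list. A minimal sketch, where `lf_other2` stands in for a
+ hypothetical second frame (see the examples below for concrete usage):
+
+ >>> lf.with_context([lf_other, lf_other2]).collect()  # doctest: +SKIP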
+ + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. 
+ fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... 
) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill `value` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
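+
+ For instance, for the column `a = [1, 2, 3, 4]` in the example below, the mean
+ is 2.5 and the sum of squared deviations is 5.0, so `var(ddof=1)` gives
+ 5.0 / 3 ≈ 1.666667 while `var(ddof=0)` gives 5.0 / 4 = 1.25, matching the
+ outputs shown.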
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... 
) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. 
+ + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The `schema` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, `predicate_pushdown` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on; if given `None` the implicit row + index is used as a join key instead. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. 
+ * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. 
This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
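Illustrative note, not part of the generated stub above: the LazyFrame hunk documents `set_sorted` without a runnable example. Below is a minimal usage sketch (made-up data, assumed column names) showing it together with `merge_sorted` as described in the docstrings above.

# Minimal sketch: set_sorted only records that a column is already sorted;
# merge_sorted then merges two frames that the caller guarantees are sorted
# by that key, producing an output that is again sorted by the key.
import polars as pl

left = pl.LazyFrame({"t": [1, 3, 5], "v": [10, 30, 50]}).set_sorted("t")
right = pl.LazyFrame({"t": [2, 4], "v": [20, 40]}).set_sorted("t")

# Both schemas are equal and both inputs are sorted by "t", as required.
print(left.merge_sorted(right, key="t").collect())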
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/series/series deleted file mode 100644 index 4a8f5ad..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/series/series +++ /dev/null @@ -1,368 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, NullBehavior as NullBehavior, NumericLiteral as NumericLiteral, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TemporalLiteral as TemporalLiteral -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.deprecation 
import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, Literal, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> tuple[int, int, int]: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Series: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Series: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Series: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Series: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Series: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Series: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Series: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... 
- def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - @overload - def __floordiv__(self, other: Expr) -> Expr: ... - @overload - def __floordiv__(self, other: Any) -> Series: ... - def __invert__(self) -> Series: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def __column_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _repr_html_(self) -> str: ... - def item(self, index: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def cbrt(self) -> Series: ... - @overload - def any(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def any(self, *, ignore_nulls: bool) -> bool | None: ... - @overload - def all(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def all(self, *, ignore_nulls: bool) -> bool | None: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... 
- def product(self) -> int | float: ... - def pow(self, exponent: int | float | None | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: bool) -> Series | DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: bool) -> Series | DataFrame: ... - def rle(self) -> Series: ... - def rle_id(self) -> Series: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool | None = ...) -> Self: ... - def extend(self, other: Series) -> Self: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) 
-> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int | IntoExprColumn = ...) -> Series: ... - def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def not_(self) -> Series: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first_distinct(self) -> Series: ... - def is_last_distinct(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool | None = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str | bool | None) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... 
- def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def round_sig_figs(self, digits: int) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def cot(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, n: int = ..., *, fill_value: IntoExpr | None = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int | IntoExprColumn = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... 
- def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def is_first(self) -> Series: ... - def is_last(self) -> Series: ... - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: ... - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, n: int = ...) -> Series: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: ... 
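The replacement stub added below begins with a `#: version 0.19.13` marker comment. As a minimal sketch (a hypothetical helper, not part of this patch), that marker could be read back from a stub file like this:

from pathlib import Path
from typing import Optional

def read_stub_version(stub_path: Path) -> Optional[str]:
    # Hypothetical helper: the regenerated .pyi stubs start with a
    # "#: version <polars version>" comment; return that version if present.
    text = stub_path.read_text()
    first_line = text.splitlines()[0] if text else ""
    prefix = "#: version "
    return first_line[len(prefix):] if first_line.startswith(prefix) else None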
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/series/series.pyi new file mode 100644 index 0000000..14a8d65 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.13/polars/series/series.pyi @@ -0,0 +1,4823 @@ +#: version 0.19.13 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import ShapeError as ShapeError +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: 
bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the `Series` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series <= other`.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series < other`.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series == other`.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series == other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series != other`.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series != other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series >= other`.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series > other`.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to `s[0]`, with a check + that the shape is (1,). With an index, this is equivalent to `s[index]`. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. 
+ + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. 
+ To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. 
This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. + + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. 
+ + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴────────┘ + + Sort the output by count. + + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and `append` will change to always + behave like `append_chunks=True` (the previous default). For the + behavior of `append_chunks=False`, use `Series.extend`. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. 
In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. + + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from `append`, which adds the chunks from `other` to the chunks of + this series, `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer `append` over `extend` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single `Series`. In the latter case, finish the sequence + of `append` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. 
+ + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1, 2, 2, 3])
+        >>> s.unique().sort()
+        shape: (3,)
+        Series: \'a\' [i64]
+        [
+            1
+            2
+            3
+        ]
+
+        '''
+    def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series:
+        '''
+        Take values by index.
+
+        Parameters
+        ----------
+        indices
+            Index location used for selection.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1, 2, 3, 4])
+        >>> s.take([1, 3])
+        shape: (2,)
+        Series: \'a\' [i64]
+        [
+            2
+            4
+        ]
+
+        '''
+    def null_count(self) -> int:
+        """Count the null values in this Series."""
+    def has_validity(self) -> bool:
+        """
+        Return True if the Series has a validity bitmask.
+
+        If there is no mask, it means that there are no `null` values.
+
+        Notes
+        -----
+        While the *absence* of a validity bitmask guarantees that a Series does not
+        have `null` values, the converse is not true, e.g. the *presence* of a
+        bitmask does not mean that there are null values, as every value of the
+        bitmask could be `false`.
+
+        To confirm that a column has `null` values use :func:`null_count`.
+
+        """
+    def is_empty(self) -> bool:
+        '''
+        Check if the Series is empty.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [], dtype=pl.Float32)
+        >>> s.is_empty()
+        True
+
+        '''
+    def is_sorted(self) -> bool:
+        """
+        Check if the Series is sorted.
+
+        Parameters
+        ----------
+        descending
+            Check if the Series is sorted in descending order
+
+        """
+    def not_(self) -> Series:
+        '''
+        Negate a boolean Series.
+
+        Returns
+        -------
+        Series
+            Series of data type :class:`Boolean`.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [True, False, False])
+        >>> s.not_()
+        shape: (3,)
+        Series: \'a\' [bool]
+        [
+            false
+            true
+            true
+        ]
+
+        '''
+    def is_null(self) -> Series:
+        '''
+        Returns a boolean Series indicating which values are null.
+
+        Returns
+        -------
+        Series
+            Series of data type :class:`Boolean`.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1.0, 2.0, 3.0, None])
+        >>> s.is_null()
+        shape: (4,)
+        Series: \'a\' [bool]
+        [
+            false
+            false
+            false
+            true
+        ]
+
+        '''
+    def is_not_null(self) -> Series:
+        '''
+        Returns a boolean Series indicating which values are not null.
+
+        Returns
+        -------
+        Series
+            Series of data type :class:`Boolean`.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1.0, 2.0, 3.0, None])
+        >>> s.is_not_null()
+        shape: (4,)
+        Series: \'a\' [bool]
+        [
+            true
+            true
+            true
+            false
+        ]
+
+        '''
+    def is_finite(self) -> Series:
+        '''
+        Returns a boolean Series indicating which values are finite.
+
+        Returns
+        -------
+        Series
+            Series of data type :class:`Boolean`.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> s = pl.Series("a", [1.0, 2.0, np.inf])
+        >>> s.is_finite()
+        shape: (3,)
+        Series: \'a\' [bool]
+        [
+            true
+            true
+            false
+        ]
+
+        '''
+    def is_infinite(self) -> Series:
+        '''
+        Returns a boolean Series indicating which values are infinite.
+
+        Returns
+        -------
+        Series
+            Series of data type :class:`Boolean`.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> s = pl.Series("a", [1.0, 2.0, np.inf])
+        >>> s.is_infinite()
+        shape: (3,)
+        Series: \'a\' [bool]
+        [
+            false
+            false
+            true
+        ]
+
+        '''
+    def is_nan(self) -> Series:
+        '''
+        Returns a boolean Series indicating which values are NaN.
+
+        Returns
+        -------
+        Series
+            Series of data type :class:`Boolean`.
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. 
+ + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Return the number of elements in this Series. + + Null values are treated like regular elements in this context. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point `nan` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set `zero_copy_only=True`. + + Alternatively, if you want a zero-copy view and know what you are doing, + use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. 
+        zero_copy_only
+            If True, an exception will be raised if the conversion to a numpy
+            array would require copying the underlying data (e.g. in presence
+            of nulls, or for non-primitive types).
+        writable
+            For numpy arrays created with zero copy (view on the Arrow data),
+            the resulting array is not writable (Arrow data is immutable).
+            By setting this to True, a copy of the array is made to ensure
+            it is writable.
+        use_pyarrow
+            Use `pyarrow.Array.to_numpy
+            `_
+
+            for the conversion to numpy.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1, 2, 3])
+        >>> arr = s.to_numpy()
+        >>> arr  # doctest: +IGNORE_RESULT
+        array([1, 2, 3], dtype=int64)
+        >>> type(arr)
+        <class 'numpy.ndarray'>
+
+        '''
+    def to_arrow(self) -> pa.Array:
+        '''
+        Get the underlying Arrow Array.
+
+        If the Series contains only a single chunk this operation is zero copy.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1, 2, 3])
+        >>> s = s.to_arrow()
+        >>> s  # doctest: +ELLIPSIS
+        <pyarrow.lib.Int64Array object at ...>
+        [
+          1,
+          2,
+          3
+        ]
+
+        '''
+    def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]:
+        '''
+        Convert this Series to a pandas Series.
+
+        This requires that :mod:`pandas` and :mod:`pyarrow` are installed.
+        This operation clones data, unless `use_pyarrow_extension_array=True`.
+
+        Parameters
+        ----------
+        use_pyarrow_extension_array
+            Use PyArrow backed-extension array instead of numpy array for pandas
+            Series. This allows zero copy operations and preservation of null
+            values. Further operations on this pandas Series might trigger
+            conversion to NumPy arrays if that operation is not supported by
+            pyarrow compute functions.
+        kwargs
+            Arguments will be sent to :meth:`pyarrow.Table.to_pandas`.
+
+        Examples
+        --------
+        >>> s1 = pl.Series("a", [1, 2, 3])
+        >>> s1.to_pandas()
+        0    1
+        1    2
+        2    3
+        Name: a, dtype: int64
+        >>> s1.to_pandas(use_pyarrow_extension_array=True)  # doctest: +SKIP
+        0    1
+        1    2
+        2    3
+        Name: a, dtype: int64[pyarrow]
+        >>> s2 = pl.Series("b", [1, 2, None, 4])
+        >>> s2.to_pandas()
+        0    1.0
+        1    2.0
+        2    NaN
+        3    4.0
+        Name: b, dtype: float64
+        >>> s2.to_pandas(use_pyarrow_extension_array=True)  # doctest: +SKIP
+        0       1
+        1       2
+        2    <NA>
+        3       4
+        Name: b, dtype: int64[pyarrow]
+
+        '''
+    def to_init_repr(self, n: int = ...) -> str:
+        '''
+        Convert Series to instantiatable string representation.
+
+        Parameters
+        ----------
+        n
+            Only use first n elements.
+
+        See Also
+        --------
+        polars.Series.to_init_repr
+        polars.from_repr
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16)
+        >>> print(s.to_init_repr())
+        pl.Series("a", [1, 2, None, 4], dtype=pl.Int16)
+        >>> s_from_str_repr = eval(s.to_init_repr())
+        >>> s_from_str_repr
+        shape: (4,)
+        Series: \'a\' [i16]
+        [
+            1
+            2
+            null
+            4
+        ]
+
+        '''
+    def set(self, filter: Series, value: int | float | str | bool | None) -> Series:
+        '''
+        Set masked values.
+
+        Parameters
+        ----------
+        filter
+            Boolean mask.
+        value
+            Value with which to replace the masked values.
+
+        Notes
+        -----
+        Use of this function is frequently an anti-pattern, as it can
+        block optimisation (predicate pushdown, etc). Consider using
+        `pl.when(predicate).then(value).otherwise(self)` instead.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1, 2, 3])
+        >>> s.set(s == 2, 10)
+        shape: (3,)
+        Series: \'a\' [i64]
+        [
+            1
+            10
+            3
+        ]
+
+        It is better to implement this as follows:
+
+        >>> s.to_frame().select(
+        ...     pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a"))
+        ...
) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + Series + The mutated series. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def round_sig_figs(self, digits: int) -> Series: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s.round_sig_figs(2) + shape: (3,) + Series: '' [f64] + [ + 0.012 + 3.3 + 1200.0 + ] + + """ + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. 
+ skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. + + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify `fill_value` to fill the resulting null values. + + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their std dev. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. 
+ + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. 
If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. 
+ - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. 
+ + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no lower bound is applied. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no upper bound is applied. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> s = pl.Series([-50, 5, 50, None]) + >>> s.clip(1, 10) + shape: (4,) + Series: '' [i64] + [ + 1 + 5 + 10 + null + ] + + Specifying only a single bound: + + >>> s.clip(upper_bound=10) + shape: (4,) + Series: '' [i64] + [ + -50 + 5 + 10 + null + ] + + """ + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... 
+ + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of `pl.first()`: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. 
deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
+def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/dataframe/frame.pyi similarity index 99% rename from polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/dataframe/frame rename to polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/dataframe/frame.pyi index 562effd..db8d5bf 100644 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/dataframe/frame +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/dataframe/frame.pyi @@ -1,3 +1,4 @@ +#: version 0.19.14 import P import deltalake import np as np @@ -36,7 +37,7 @@ _dtype_str_repr: builtin_function_or_method class DataFrame: _accessors: _ClassVar[set] = ... - columns: Incomplete + columns: list[str] def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... @classmethod def _from_pydf(cls, py_df: PyDataFrame) -> Self: @@ -2203,9 +2204,9 @@ class DataFrame: └─────┴─────┘ ''' - def equals(self, other: DataFrame) -> bool: + def frame_equal(self, other: DataFrame) -> bool: ''' - Check whether the DataFrame is equal to another DataFrame. + Check if DataFrame is equal to other. Parameters ---------- @@ -2214,10 +2215,6 @@ class DataFrame: null_equal Consider null values as equal. - See Also - -------- - assert_frame_equal - Examples -------- >>> df1 = pl.DataFrame( @@ -2234,9 +2231,9 @@ class DataFrame: ... "ham": ["c", "b", "a"], ... } ... ) - >>> df1.equals(df1) + >>> df1.frame_equal(df1) True - >>> df1.equals(df2) + >>> df1.frame_equal(df2) False ''' @@ -4170,8 +4167,8 @@ class DataFrame: -------- >>> df = pl.DataFrame( ... { - ... "a": [1.5, 2, float("nan"), 4], - ... "b": [0.5, 4, float("nan"), 13], + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], ... } ... ) >>> df.fill_nan(99) @@ -6948,20 +6945,6 @@ class DataFrame: new_column Series that will replace the column. """ - def frame_equal(self, other: DataFrame) -> bool: - """ - Check whether the DataFrame is equal to another DataFrame. - - .. deprecated:: 0.19.16 - This method has been renamed to :func:`equals`. - - Parameters - ---------- - other - DataFrame to compare with. - null_equal - Consider null values as equal. - """ @property def shape(self): ... 
@property diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/expr/expr.pyi new file mode 100644 index 0000000..5070a45 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/expr/expr.pyi @@ -0,0 +1,8295 @@ +#: version 0.19.14 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... 
+ def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + .. 
note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.map`. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + keep_name + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.prefix`. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.suffix`. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.keep`. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with `^` and end with `$`. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... 
"""Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Return the number of elements in the column. + + .. warning:: + Null values are treated like regular elements in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values are treated like regular elements in this context. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. 
If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cum_sum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_sum().alias("cum_sum"), + ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_sum ┆ cum_sum_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 10 │ + │ 2 ┆ 3 ┆ 9 │ + │ 3 ┆ 6 ┆ 7 │ + │ 4 ┆ 10 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_sum().alias("value_cum_sum"), + ... pl.col("values") + ... .cum_sum() + ... .forward_fill() + ... 
.alias("value_cum_sum_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬───────────────┬──────────────────────────┐ + │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═══════════════╪══════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴───────────────┴──────────────────────────┘ + + ''' + def cum_prod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_prod().alias("cum_prod"), + ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), + ... ) + shape: (4, 3) + ┌─────┬──────────┬──────────────────┐ + │ a ┆ cum_prod ┆ cum_prod_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════╪══════════════════╡ + │ 1 ┆ 1 ┆ 24 │ + │ 2 ┆ 2 ┆ 24 │ + │ 3 ┆ 6 ┆ 12 │ + │ 4 ┆ 24 ┆ 4 │ + └─────┴──────────┴──────────────────┘ + + ''' + def cum_min(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_min().alias("cum_min"), + ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_min ┆ cum_min_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 1 ┆ 3 │ + │ 4 ┆ 1 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + ''' + def cum_max(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_max().alias("cum_max"), + ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_max ┆ cum_max_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 2 ┆ 4 │ + │ 3 ┆ 3 ┆ 4 │ + │ 4 ┆ 4 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_max().alias("cum_max"), + ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬─────────┬────────────────────┐ + │ values ┆ cum_max ┆ cum_max_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════════╪════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴─────────┴────────────────────┘ + + ''' + def cum_count(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_count().alias("cum_count"), + ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), + ... ) + shape: (4, 3) + ┌─────┬───────────┬───────────────────┐ + │ a ┆ cum_count ┆ cum_count_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ u32 ┆ u32 │ + ╞═════╪═══════════╪═══════════════════╡ + │ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 2 ┆ 1 │ + │ 4 ┆ 3 ┆ 0 │ + └─────┴───────────┴───────────────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def round_sig_figs(self, digits: int) -> Self: + ''' + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) + >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) + shape: (3, 2) + ┌─────────┬────────────────┐ + │ a ┆ round_sig_figs │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════════╪════════════════╡ + │ 0.01234 ┆ 0.012 │ + │ 3.333 ┆ 3.3 │ + │ 1234.0 ┆ 1200.0 │ + └─────────┴────────────────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... 
pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + See Also + -------- + Expr.get : Take a single value + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg( + ... pl.col("value").gather([2, 1]) + ... ) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + ''' + def get(self, index: int | Expr) -> Self: + ''' + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. 
+ + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... 
) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), + ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), + ... 
).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for `map` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Notes + ----- + If you are looking to map a function over a window function or group_by context, + refer to func:`map_elements` instead. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + map_elements + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type `Callable[[Any], Any]`. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type `Callable[[Series], Any]`. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be `pl.Unknown`. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). 
+ pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using `map_elements` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using `over` is considered a GroupBy context + here, so `map_elements` can be used to map functions over window groups. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using `over` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... 
).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").gather_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator `expr & other & ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator `expr | other | ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other` where `None == None`. + + This differs from default `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator `expr >= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator `expr > other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator `expr <= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator `expr < other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator `expr != other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... 
"y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr != other` where `None == None`. + + This differs from default `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator `expr + other`. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator `expr // other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator `expr % other`. + + Parameters + ---------- + other + Numeric literal or expression value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator `expr * other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator `expr - other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator `expr / other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator `expr ** exponent`. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator `expr ^ other`. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) + shape: (3, 3) + ┌───────────┬──────────────────┬──────────┐ + │ sets ┆ optional_members ┆ contains │ + │ --- ┆ --- ┆ --- │ + │ list[i64] ┆ i64 ┆ bool │ + ╞═══════════╪══════════════════╪══════════╡ + │ [1, 2, 3] ┆ 1 ┆ true │ + │ [1, 2] ┆ 2 ┆ true │ + │ [9, 10] ┆ 3 ┆ false │ + └───────────┴──────────────────┴──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. 
Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with `lit` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of :func:`rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... 
pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Self: + ''' + Compute a rolling standard deviation. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. 
+ weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... 
) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. 
+ - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. 
+ + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: + ''' + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. 
+ + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... 
) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... 
) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type `pl.Categorical`. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of `pl.first()`: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. 
+ agg_list + Aggregate list + + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + `polars.Unknown`. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. 
warning:: + This is highly unsafe as this will call the C function + loaded by `lib::symbol`. + + The parameters you give dictate how polars will deal + with the function. Make sure they are correct! + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. + returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like `sum`, `min`, `covariance` etc. + cast_to_supertypes + Cast the input datatypes to their supertype. + pass_name_to_apply + if set, then the `Series` passed to the function in the group_by operation + will ensure the name is set. This is an extra heap allocation per group. + changes_length + For example a `unique` or a `slice` + + """ + def _register_plugin(self) -> Self: ... + def take_every(self, n: int) -> Self: + """ + Take every nth value in the Series and return as a new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + """ + def cumsum(self) -> Self: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumprod(self) -> Self: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummin(self) -> Self: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummax(self) -> Self: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumcount(self) -> Self: + """ + Get an array with the cumulative count computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_count`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..1f582e8 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/lazyframe/frame.pyi @@ -0,0 +1,4176 @@ +#: version 0.19.14 +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use `pl.scan_csv` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use `pl.scan_parquet` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use `pl.scan_ipc` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use `pl.scan_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to `StringIO` + and then use `LazyFrame.deserialize`. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to `deserialize`. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) 
-> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to `True`. + If this is set to `True` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. 
+ comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. 
+ + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to `False`. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + This functionality is currently in an alpha state. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. 
+ polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + DataFrame directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + This allows streaming results that are larger than RAM to be written to disk. 
+ + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. 
If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... 
) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... ).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... 
} + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to `True` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. 
+ + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `dynamic_group_by` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series `<t_0, t_1, ..., t_n>`, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. 
If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... ) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... 
) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... 
every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported by the streaming engine. + - This is only supported when joining on single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. 
+ + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... 
(pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. 
+ + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another DataFrame: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the DataFrame. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the DataFrame. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), + polars will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. 
+ fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... 
) + >>> lf.gather_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill `value` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
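+
+        As a quick arithmetic check of the `N - ddof` divisor described above, using
+        column `a` from the example below (mean 2.5, squared deviations summing to 5.0):
+
+        >>> sum((x - 2.5) ** 2 for x in [1, 2, 3, 4]) / (4 - 1)  # default ddof=1
+        1.6666666666666667
+        >>> sum((x - 2.5) ** 2 for x in [1, 2, 3, 4]) / 4  # ddof=0
+        1.25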
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the DataFrame to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this DataFrame. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... 
) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. 
+ + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The `schema` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, `predicate_pushdown` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the DataFrame at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on; if given `None` the implicit row + index is used as a join key instead. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. 
+ * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right DataFrame will be used to update the + left DataFrame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. 
This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> Self: + """ + Take every nth row in the LazyFrame and return as a new LazyFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
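For reference, each deprecation note above points at a renamed replacement; a minimal sketch of that old-to-new mapping (assuming `import polars as pl`; the frame and column names here are illustrative only and nothing is collected):

    import polars as pl

    # a small sorted integer index column, as required by rolling / group_by_dynamic
    lf = pl.LazyFrame({"idx": [1, 2, 3], "a": ["x", "x", "y"], "b": [1, 2, 3]})

    lf.group_by("a").agg(pl.col("b").sum())      # replaces lf.groupby(...)
    lf.group_by_dynamic("idx", every="2i")       # replaces lf.groupby_dynamic(...)
    lf.rolling(index_column="idx", period="2i")  # replaces lf.groupby_rolling / lf.group_by_rolling
    lf.map_batches(lambda df: df)                # replaces lf.map(...)
    lf.shift(1, fill_value=0)                    # replaces lf.shift_and_fill(fill_value=0, n=1)
    lf.gather_every(2)                           # replaces lf.take_every(2)
    lf.approx_n_unique()                         # replaces lf.approx_unique()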
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/series/series.pyi similarity index 97% rename from polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/series/series rename to polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/series/series.pyi index 4a40006..9b8a6f9 100644 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/series/series +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/series/series.pyi @@ -1,3 +1,4 @@ +#: version 0.19.14 import np as np import pa as pa import pd as pd @@ -21,7 +22,7 @@ from polars.utils._wrap import wrap_df as wrap_df from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence TYPE_CHECKING: bool @@ -1967,7 +1968,7 @@ class Series: Examples -------- >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) >>> s.is_nan() shape: (4,) Series: \'a\' [bool] @@ -1991,7 +1992,7 @@ class Series: Examples -------- >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) >>> s.is_not_nan() shape: (4,) Series: \'a\' [bool] @@ -2184,9 +2185,9 @@ class Series: Series.str.explode : Explode a string column. """ - def equals(self, other: Series) -> bool: + def series_equal(self, other: Series) -> bool: ''' - Check whether the Series is equal to another Series. + Check if series is equal with another Series. Parameters ---------- @@ -2198,18 +2199,15 @@ class Series: Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a `pl.Int64` will return `False`. - See Also - -------- - assert_series_equal - Examples -------- - >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s = pl.Series("a", [1, 2, 3]) >>> s2 = pl.Series("b", [4, 5, 6]) - >>> s1.equals(s1) + >>> s.series_equal(s) True - >>> s1.equals(s2) + >>> s.series_equal(s2) False + ''' def len(self) -> int: ''' @@ -4201,75 +4199,77 @@ class Series: ] ''' - def replace(self, mapping: dict[Any, Any]) -> Self: + def map_dict(self, remapping: dict[Any, Any]) -> Self: ''' - Replace values according to the given mapping. - - Needs a global string cache for lazily evaluated queries on columns of - type `Categorical`. 
+ Replace values in the Series using a remapping dictionary. Parameters ---------- - mapping - Mapping of values to their replacement. + remapping + Dictionary containing the before/after values to map. default - Value to use when the mapping does not contain the lookup value. - Defaults to keeping the original value. + Value to use when the remapping dict does not contain the lookup value. + Use `pl.first()`, to keep the original value. return_dtype Set return dtype to override automatic return dtype determination. - See Also - -------- - str.replace - Examples -------- - Replace a single value by another value. Values not in the mapping remain - unchanged. + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.replace({2: 100}) + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") shape: (4,) - Series: \'a\' [i64] + Series: \'country_name\' [str] [ - 1 - 100 - 100 - 3 + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" ] - Replace multiple values. Specify a default to set values not in the given map - to the default value. + ...or keep the original value, by making use of `pl.first()`: - >>> s = pl.Series("country_code", ["FR", "ES", "DE", None]) - >>> country_code_map = { - ... "CA": "Canada", - ... "DE": "Germany", - ... "FR": "France", - ... None: "unspecified", - ... } - >>> s.replace(country_code_map, default=None) + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") shape: (4,) - Series: \'country_code\' [str] + Series: \'country_name\' [str] [ - "France" - null - "Germany" - "unspecified" + "Türkiye" + "???" + "Japan" + "Netherlands" ] - The return type can be overridden with the `return_dtype` argument. + ...or keep the original value, by assigning the input series: - >>> s = pl.Series("a", [0, 1, 2, 3]) - >>> s.replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) + >>> s.map_dict(country_lookup, default=s).alias("country_name") shape: (4,) - Series: \'a\' [u8] + Series: \'country_name\' [str] [ - 0 - 10 - 20 - 0 + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 ] + ''' def reshape(self, dimensions: tuple[int, ...]) -> Series: ''' @@ -4924,42 +4924,6 @@ class Series: If False then an Exception is raised if nulls are present. """ - def map_dict(self, mapping: dict[Any, Any]) -> Self: - """ - Replace values in the Series using a remapping dictionary. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`replace`. The default behavior - has changed to keep any values not present in the mapping unchanged. - Pass `default=None` to keep existing behavior. - - Parameters - ---------- - mapping - Dictionary containing the before/after values to map. - default - Value to use when the remapping dict does not contain the lookup value. - Use `pl.first()`, to keep the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - """ - def series_equal(self, other: Series) -> bool: - """ - Check whether the Series is equal to another Series. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`equals`. 
- - Parameters - ---------- - other - Series to compare with. - null_equal - Consider null values as equal. - strict - Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a - `pl.Int64` will return `False`. - """ @property def dtype(self): ... @property diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.16/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/dataframe/frame.pyi similarity index 99% rename from polugins_type_gen/src/polugins_type_gen/_stubs/0.19.16/polars/dataframe/frame rename to polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/dataframe/frame.pyi index 562effd..46e3dde 100644 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.16/polars/dataframe/frame +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/dataframe/frame.pyi @@ -1,3 +1,4 @@ +#: version 0.19.15 import P import deltalake import np as np @@ -36,7 +37,7 @@ _dtype_str_repr: builtin_function_or_method class DataFrame: _accessors: _ClassVar[set] = ... - columns: Incomplete + columns: list[str] def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... @classmethod def _from_pydf(cls, py_df: PyDataFrame) -> Self: @@ -2203,9 +2204,9 @@ class DataFrame: └─────┴─────┘ ''' - def equals(self, other: DataFrame) -> bool: + def frame_equal(self, other: DataFrame) -> bool: ''' - Check whether the DataFrame is equal to another DataFrame. + Check if DataFrame is equal to other. Parameters ---------- @@ -2214,10 +2215,6 @@ class DataFrame: null_equal Consider null values as equal. - See Also - -------- - assert_frame_equal - Examples -------- >>> df1 = pl.DataFrame( @@ -2234,9 +2231,9 @@ class DataFrame: ... "ham": ["c", "b", "a"], ... } ... ) - >>> df1.equals(df1) + >>> df1.frame_equal(df1) True - >>> df1.equals(df2) + >>> df1.frame_equal(df2) False ''' @@ -4170,8 +4167,8 @@ class DataFrame: -------- >>> df = pl.DataFrame( ... { - ... "a": [1.5, 2, float("nan"), 4], - ... "b": [0.5, 4, float("nan"), 13], + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], ... } ... ) >>> df.fill_nan(99) @@ -6948,20 +6945,6 @@ class DataFrame: new_column Series that will replace the column. """ - def frame_equal(self, other: DataFrame) -> bool: - """ - Check whether the DataFrame is equal to another DataFrame. - - .. deprecated:: 0.19.16 - This method has been renamed to :func:`equals`. - - Parameters - ---------- - other - DataFrame to compare with. - null_equal - Consider null values as equal. - """ @property def shape(self): ... 
@property diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/expr/expr.pyi similarity index 97% rename from polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/expr/expr rename to polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/expr/expr.pyi index 5131d44..e6adb86 100644 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/expr/expr +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/expr/expr.pyi @@ -1,3 +1,4 @@ +#: version 0.19.15 import P import np as np import pl @@ -20,7 +21,7 @@ from polars.utils._parse_expr_input import parse_as_expression as parse_as_expre from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, warn_closed_future_change as warn_closed_future_change from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import no_default as no_default, sphinx_accessor as sphinx_accessor +from polars.utils.various import sphinx_accessor as sphinx_accessor from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence TYPE_CHECKING: bool @@ -3433,8 +3434,8 @@ class Expr: See Also -------- + map_dict map_elements - replace Examples -------- @@ -7864,118 +7865,167 @@ class Expr: caches expressions that are equal. """ - def replace(self, mapping: dict[Any, Any]) -> Self: + def map_dict(self, remapping: dict[Any, Any]) -> Self: ''' - Replace values according to the given mapping. + Replace values in column according to remapping dictionary. Needs a global string cache for lazily evaluated queries on columns of - type `Categorical`. + type `pl.Categorical`. Parameters ---------- - mapping - Mapping of values to their replacement. + remapping + Dictionary containing the before/after values to map. default - Value to use when the mapping does not contain the lookup value. - Defaults to keeping the original value. Accepts expression input. - Non-expression inputs are parsed as literals. + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. return_dtype Set return dtype to override automatic return dtype determination. See Also -------- - str.replace + map Examples -------- - Replace a single value by another value. Values not in the mapping remain - unchanged. - - >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) - >>> df.with_columns(pl.col("a").replace({2: 100}).alias("replaced")) - shape: (4, 2) - ┌─────┬──────────┐ - │ a ┆ replaced │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════════╡ - │ 1 ┆ 1 │ - │ 2 ┆ 100 │ - │ 2 ┆ 100 │ - │ 3 ┆ 3 │ - └─────┴──────────┘ - - Replace multiple values. Specify a default to set values not in the given map - to the default value. - - >>> df = pl.DataFrame({"country_code": ["FR", "ES", "DE", None]}) - >>> country_code_map = { + >>> country_code_dict = { ... "CA": "Canada", ... "DE": "Germany", ... "FR": "France", - ... None: "unspecified", + ... None: "Not specified", ... } + >>> df = pl.DataFrame( + ... { + ... 
"country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + >>> df.with_columns( ... pl.col("country_code") - ... .replace(country_code_map, default=None) - ... .alias("replaced") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") ... ) - shape: (4, 2) - ┌──────────────┬─────────────┐ - │ country_code ┆ replaced │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞══════════════╪═════════════╡ - │ FR ┆ France │ - │ ES ┆ null │ - │ DE ┆ Germany │ - │ null ┆ unspecified │ - └──────────────┴─────────────┘ - - The return type can be overridden with the `return_dtype` argument. - - >>> df = df.with_row_count() - >>> df.select( - ... "row_nr", - ... pl.col("row_nr") - ... .replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) - ... .alias("replaced"), + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of `pl.first()`: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") ... ) - shape: (4, 2) - ┌────────┬──────────┐ - │ row_nr ┆ replaced │ - │ --- ┆ --- │ - │ u32 ┆ u8 │ - ╞════════╪══════════╡ - │ 0 ┆ 0 │ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 0 │ - └────────┴──────────┘ - - To reference other columns as a `default` value, a struct column must be - constructed first. The first field must be the column in which values are - replaced. The other columns can be used in the default expression. + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: >>> df.with_columns( - ... pl.struct("country_code", "row_nr") - ... .replace( - ... mapping=country_code_map, + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, ... default=pl.col("row_nr").cast(pl.Utf8), ... ) - ... .alias("replaced") + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") ... ) shape: (4, 3) - ┌────────┬──────────────┬─────────────┐ - │ row_nr ┆ country_code ┆ replaced │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ str ┆ str │ - ╞════════╪══════════════╪═════════════╡ - │ 0 ┆ FR ┆ France │ - │ 1 ┆ ES ┆ 1 │ - │ 2 ┆ DE ┆ Germany │ - │ 3 ┆ null ┆ unspecified │ - └────────┴──────────────┴─────────────┘ + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + ''' def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: """ @@ -8245,27 +8295,6 @@ class Expr: reverse Reverse the operation. """ - def map_dict(self, mapping: dict[Any, Any]) -> Self: - """ - Replace values in column according to remapping dictionary. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`replace`. The default behavior - has changed to keep any values not present in the mapping unchanged. - Pass `default=None` to keep existing behavior. - - Parameters - ---------- - mapping - Dictionary containing the before/after values to map. - default - Value to use when the remapping dict does not contain the lookup value. - Accepts expression input. Non-expression inputs are parsed as literals. - Use `pl.first()`, to keep the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - - """ @property def bin(self): ... @property diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/lazyframe/frame.pyi similarity index 99% rename from polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/lazyframe/frame rename to polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/lazyframe/frame.pyi index 561f5b2..f988935 100644 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/lazyframe/frame +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/lazyframe/frame.pyi @@ -1,3 +1,4 @@ +#: version 0.19.15 import P import np import pa @@ -3105,8 +3106,8 @@ class LazyFrame: -------- >>> lf = pl.LazyFrame( ... { - ... "a": [1.5, 2, float("nan"), 4], - ... "b": [0.5, 4, float("nan"), 13], + ... 
"a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], ... } ... ) >>> lf.fill_nan(99).collect() diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/series/series.pyi new file mode 100644 index 0000000..d398b97 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/series/series.pyi @@ -0,0 +1,4952 @@ +#: version 0.19.15 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Object as Object, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import ShapeError as ShapeError +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +_PYARROW_AVAILABLE: bool 
+ +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the `Series` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series <= other`.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series < other`.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series == other`.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series == other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series != other`.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series != other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series >= other`.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series > other`.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the Series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to `s[0]`, with a check + that the shape is (1,). With an index, this is equivalent to `s[index]`. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cum_sum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series.
+ + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. 
+ To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a Series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + Series has a numeric dtype). All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. 
This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. + + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. 
+ + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴────────┘ + + Sort the output by count. + + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cum_max(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cum_max() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cum_min(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cum_min() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cum_prod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_prod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cum_sum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_sum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and `append` will change to always + behave like `append_chunks=True` (the previous default). For the + behavior of `append_chunks=False`, use `Series.extend`. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. 
In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. + + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from `append`, which adds the chunks from `other` to the chunks of + this series, `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer `append` over `extend` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single `Series`. In the latter case, finish the sequence + of `append` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. 
+ + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + head + + """ + def gather_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no `null` values. + + Notes + ----- + While the *absence* of a validity bitmask guarantees that a Series does not + have `null` values, the converse is not true, eg: the *presence* of a + bitmask does not mean that there are null values, as every value of the + bitmask could be `false`. + + To confirm that a column has `null` values use :func:`null_count`. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def not_(self) -> Series: + ''' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. 
+ + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Return the number of elements in this Series. + + Null values are treated like regular elements in this context. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point `nan` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set `zero_copy_only=True`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def _view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + + Returns + ------- + SeriesView + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s._view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str | bool | None) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimization (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.scatter(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def round_sig_figs(self, digits: int) -> Series: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s.round_sig_figs(2) + shape: (3,) + Series: '' [f64] + [ + 0.012 + 3.3 + 1200.0 + ] + + """ + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). 
+ + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify `fill_value` to fill the resulting null values. + + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their std dev. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. 
The resulting values will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. 
+ + Notes + ----- + This implementation of :func:`hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) 
-> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no lower bound is applied. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no upper bound is applied. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. 
+ + Examples + -------- + Specifying both a lower and upper bound: + + >>> s = pl.Series([-50, 5, 50, None]) + >>> s.clip(1, 10) + shape: (4,) + Series: '' [i64] + [ + 1 + 5 + 10 + null + ] + + Specifying only a single bound: + + >>> s.clip(upper_bound=10) + shape: (4,) + Series: '' [i64] + [ + -50 + 5 + 10 + null + ] + + """ + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of `pl.first()`: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. 
+ + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() # doctest: +SKIP + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_integer()` instead. + For signed/unsigned variants, use `Series.dtype.is_signed_integer()` + or `Series.dtype.is_unsigned_integer()`. + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() # doctest: +SKIP + True + >>> s.is_integer(signed=False) # doctest: +SKIP + True + >>> s.is_integer(signed=True) # doctest: +SKIP + False + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() # doctest: +SKIP + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_temporal()` instead. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() # doctest: +SKIP + True + >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP + False + + """ + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Boolean` instead. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() # doctest: +SKIP + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Utf8` instead. 
+ + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() # doctest: +SKIP + True + + ''' + def take_every(self, n: int) -> Series: + """ + Take every nth value in the Series and return as new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + Index location used for selection. + """ + def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + """ + Set values at the index locations. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`scatter`. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + """ + def cumsum(self) -> Series: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + reverse the operation. + + """ + def cummax(self) -> Series: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummin(self) -> Series: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cumprod(self) -> Series: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def view(self) -> SeriesView: + """ + Get a view into this Series data with a numpy array. + + .. deprecated:: 0.19.14 + This method will be removed in a future version. + + This operation doesn't clone data, but does not include missing values. + Don't use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
+def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/dataframe/frame.pyi similarity index 99% rename from polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/dataframe/frame rename to polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/dataframe/frame.pyi index 562effd..b4fefb0 100644 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/dataframe/frame +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/dataframe/frame.pyi @@ -1,3 +1,4 @@ +#: version 0.19.17 import P import deltalake import np as np @@ -36,7 +37,7 @@ _dtype_str_repr: builtin_function_or_method class DataFrame: _accessors: _ClassVar[set] = ... - columns: Incomplete + columns: list[str] def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... @classmethod def _from_pydf(cls, py_df: PyDataFrame) -> Self: @@ -6221,6 +6222,60 @@ class DataFrame: >>> [row["b"] for row in df.iter_rows(named=True)] [2, 4, 6] + ''' + def iter_columns(self) -> Iterator[Series]: + ''' + Returns an iterator over the DataFrame\'s columns. + + Notes + ----- + Consider whether you can use :func:`all` instead. + If you can, it will be more efficient. + + Returns + ------- + Iterator of Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [s.name for s in df.iter_columns()] + [\'a\', \'b\'] + + If you\'re using this to modify a dataframe\'s columns, e.g. + + >>> # Do NOT do this + >>> pl.DataFrame(column * 2 for column in df.iter_columns()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + + then consider whether you can use :func:`all` instead: + + >>> df.select(pl.all() * 2) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + ''' def iter_slices(self, n_rows: int = ...) 
-> Iterator[DataFrame]: ''' diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.16/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/expr/expr.pyi similarity index 99% rename from polugins_type_gen/src/polugins_type_gen/_stubs/0.19.16/polars/expr/expr rename to polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/expr/expr.pyi index 5131d44..6078c62 100644 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.16/polars/expr/expr +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/expr/expr.pyi @@ -1,3 +1,4 @@ +#: version 0.19.17 import P import np as np import pl diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.16/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/lazyframe/frame.pyi similarity index 99% rename from polugins_type_gen/src/polugins_type_gen/_stubs/0.19.16/polars/lazyframe/frame rename to polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/lazyframe/frame.pyi index 561f5b2..b720588 100644 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.16/polars/lazyframe/frame +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/lazyframe/frame.pyi @@ -1,3 +1,4 @@ +#: version 0.19.17 import P import np import pa diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.16/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/series/series.pyi similarity index 99% rename from polugins_type_gen/src/polugins_type_gen/_stubs/0.19.16/polars/series/series rename to polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/series/series.pyi index 4a40006..1d6a5ab 100644 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.16/polars/series/series +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.17/polars/series/series.pyi @@ -1,3 +1,4 @@ +#: version 0.19.17 import np as np import pa as pa import pd as pd @@ -4761,7 +4762,7 @@ class Series: Check if this Series datatype is numeric. .. deprecated:: 0.19.13 - Use `Series.dtype.is_float()` instead. + Use `Series.dtype.is_numeric()` instead. 
Examples -------- diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/dataframe/frame deleted file mode 100644 index 562effd..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/dataframe/frame +++ /dev/null @@ -1,6977 +0,0 @@ -import P -import deltalake -import np as np -import pa as pa -import pd as pd -from _io import BytesIO, TextIOWrapper - -from builtins import PyDataFrame -from pathlib import Path -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes.classes import Boolean as Boolean, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 -from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.col import col as col -from polars.functions.lit import lit as lit -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as 
normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence - -TYPE_CHECKING: bool -INTEGER_DTYPES: frozenset -N_INFER_DEFAULT: int -_PYARROW_AVAILABLE: bool -_dtype_str_repr: builtin_function_or_method - -class DataFrame: - _accessors: _ClassVar[set] = ... - columns: Incomplete - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: - """Construct Polars DataFrame from FFI PyDataFrame object.""" - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from a dictionary of sequences. - - Parameters - ---------- - data : dict of sequences - Two-dimensional data represented as a dictionary. dict must contain - Sequences. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - - """ - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from a sequence of sequences. - - Parameters - ---------- - data : Sequence of sequences - Two-dimensional data represented as a sequence of sequences. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - orient : {'col', 'row'}, default None - Whether to interpret two-dimensional data as columns or as rows. If None, - the orientation is inferred by matching the columns and data dimensions. If - this does not yield conclusive results, column orientation is used. - infer_schema_length - How many rows to scan to determine the column type. - - """ - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) 
-> Self: - """ - Construct a DataFrame from a numpy ndarray. - - Parameters - ---------- - data : numpy ndarray - Two-dimensional data represented as a numpy ndarray. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - orient : {'col', 'row'}, default None - Whether to interpret two-dimensional data as columns or as rows. If None, - the orientation is inferred by matching the columns and data dimensions. If - this does not yield conclusive results, column orientation is used. - - """ - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from an Arrow table. - - This operation will be zero copy for the most part. Types that are not - supported by Polars may be cast to the closest supported type. - - Parameters - ---------- - data : arrow table, array, or sequence of sequences - Data representing an Arrow Table or Array. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - rechunk : bool, default True - Make sure that all data is in contiguous memory. - - """ - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a Polars DataFrame from a pandas DataFrame. - - Parameters - ---------- - data : pandas DataFrame - Two-dimensional data represented as a pandas DataFrame. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. 
- schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - rechunk : bool, default True - Make sure that all data is in contiguous memory. - nan_to_null : bool, default True - If the data contains NaN values they will be converted to null/None. - include_index : bool, default False - Load any non-default pandas indexes as columns. - - """ - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: - """ - Read a CSV file into a DataFrame. - - Use `pl.read_csv` to dispatch to this method. - - See Also - -------- - polars.io.read_csv - - """ - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: - """ - Read into a DataFrame from a parquet file. - - Use `pl.read_parquet` to dispatch to this method. - - See Also - -------- - polars.io.read_parquet - - """ - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: - """ - Read into a DataFrame from Apache Avro format. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns. - n_rows - Stop reading from Apache Avro file after reading `n_rows`. - - """ - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: - ''' - Read into a DataFrame from Arrow IPC file format. - - See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. - Arrow IPC files are also known as Feather (v2) files. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns to select. Accepts a list of column indices (starting at zero) or a - list of column names. - n_rows - Stop reading from IPC file after reading `n_rows`. - row_count_name - Row count name. - row_count_offset - Row count offset. - rechunk - Make sure that all data is contiguous. - memory_map - Memory map the file - - ''' - @classmethod - def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: - ''' - Read into a DataFrame from Arrow IPC record batch stream format. - - See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns to select. Accepts a list of column indices (starting at zero) or a - list of column names. - n_rows - Stop reading from IPC stream after reading `n_rows`. - row_count_name - Row count name. - row_count_offset - Row count offset. - rechunk - Make sure that all data is contiguous. - - ''' - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: - """ - Read into a DataFrame from a JSON file. - - Use `pl.read_json` to dispatch to this method. - - See Also - -------- - polars.io.read_json - - """ - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: - """ - Read into a DataFrame from a newline delimited JSON file. - - Use `pl.read_ndjson` to dispatch to this method. 
- - See Also - -------- - polars.io.read_ndjson - - """ - def _replace(self, column: str, new_column: Series) -> Self: - """Replace a column by a new Series (in place).""" - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: - """ - Numpy __array__ interface protocol. - - Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see - https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. - """ - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: - ''' - Convert to a dataframe object implementing the dataframe interchange protocol. - - Parameters - ---------- - nan_as_null - Overwrite null values in the data with `NaN`. - - .. warning:: - This functionality has not been implemented and the parameter will be - removed in a future version. - Setting this to `True` will raise a `NotImplementedError`. - allow_copy - Allow memory to be copied to perform the conversion. If set to `False`, - causes conversions that are not zero-copy to fail. - - Notes - ----- - Details on the Python dataframe interchange protocol: - https://data-apis.org/dataframe-protocol/latest/index.html - - Examples - -------- - Convert a Polars DataFrame to a generic dataframe object and access some - properties. - - >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) - >>> dfi = df.__dataframe__() - >>> dfi.num_rows() - 2 - >>> dfi.get_column(1).dtype - (<DtypeKind.FLOAT: 2>, 64, \'g\', \'=\') - - ''' - def __dataframe_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. - """ - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with another object.""" - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with another DataFrame.""" - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with a non-DataFrame object.""" - def _div(self, other: Any) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Series]: ... - def __reversed__(self) -> Iterator[Series]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: 
- def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: - """Get item. Does quite a lot. Read the comments.""" - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: - """ - Format output data in HTML for display in Jupyter Notebooks. - - Output rows and columns can be modified by setting the following ENVIRONMENT - variables: - - * POLARS_FMT_MAX_COLS: set the number of columns - * POLARS_FMT_MAX_ROWS: set the number of rows - - """ - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: - ''' - Return the DataFrame as a scalar, or return the element at the given row/column. - - Parameters - ---------- - row - Optional row index. - column - Optional column index or name. - - See Also - -------- - row: Get the values of a single row, either by index or by predicate. - - Notes - ----- - If row/col not provided, this is equivalent to `df[0,0]`, with a check that - the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> df.select((pl.col("a") * pl.col("b")).sum()).item() - 32 - >>> df.item(1, 1) - 5 - >>> df.item(2, "b") - 6 - - ''' - def to_arrow(self) -> pa.Table: - ''' - Collect the underlying arrow arrays in an Arrow Table. - - This operation is mostly zero copy. - - Data types that do copy: - - CategoricalType - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} - ... ) - >>> df.to_arrow() - pyarrow.Table - foo: int64 - bar: large_string - ---- - foo: [[1,2,3,4,5,6]] - bar: [["a","b","c","d","e","f"]] - - ''' - def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: - ''' - Convert DataFrame to a dictionary mapping column name to values. - - Parameters - ---------- - as_series - True -> Values are Series - False -> Values are List[Any] - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4, 5], - ... "fruits": ["banana", "banana", "apple", "apple", "banana"], - ... "B": [5, 4, 3, 2, 1], - ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], - ... "optional": [28, 300, None, 2, -30], - ... } - ... 
) - >>> df - shape: (5, 5) - ┌─────┬────────┬─────┬────────┬──────────┐ - │ A ┆ fruits ┆ B ┆ cars ┆ optional │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ - ╞═════╪════════╪═════╪════════╪══════════╡ - │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ - │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ - │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ - │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ - │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ - └─────┴────────┴─────┴────────┴──────────┘ - >>> df.to_dict(as_series=False) - {\'A\': [1, 2, 3, 4, 5], - \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], - \'B\': [5, 4, 3, 2, 1], - \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], - \'optional\': [28, 300, None, 2, -30]} - >>> df.to_dict(as_series=True) - {\'A\': shape: (5,) - Series: \'A\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ], \'fruits\': shape: (5,) - Series: \'fruits\' [str] - [ - "banana" - "banana" - "apple" - "apple" - "banana" - ], \'B\': shape: (5,) - Series: \'B\' [i64] - [ - 5 - 4 - 3 - 2 - 1 - ], \'cars\': shape: (5,) - Series: \'cars\' [str] - [ - "beetle" - "audi" - "beetle" - "beetle" - "beetle" - ], \'optional\': shape: (5,) - Series: \'optional\' [i64] - [ - 28 - 300 - null - 2 - -30 - ]} - - ''' - def to_dicts(self) -> list[dict[str, Any]]: - ''' - Convert every row to a dictionary of Python-native values. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.to_dicts() - [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] - - ''' - def to_numpy(self) -> np.ndarray[Any, Any]: - ''' - Convert DataFrame to a 2D NumPy array. - - This operation clones data. - - Parameters - ---------- - structured - Optionally return a structured array, with field names and - dtypes that correspond to the DataFrame schema. - order - The index order of the returned NumPy array, either C-like or - Fortran-like. In general, using the Fortran-like index order is faster. - However, the C-like order might be more appropriate to use for downstream - applications to prevent cloning data, e.g. when reshaping into a - one-dimensional array. Note that this option only takes effect if - `structured` is set to `False` and the DataFrame dtypes allow for a - global dtype for all columns. - - Notes - ----- - If you\'re attempting to convert Utf8 to an array you\'ll need to install - `pyarrow`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.5, 7.0, 8.5], - ... "ham": ["a", "b", "c"], - ... }, - ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, - ... ) - - Export to a standard 2D numpy array. - - >>> df.to_numpy() - array([[1, 6.5, \'a\'], - [2, 7.0, \'b\'], - [3, 8.5, \'c\']], dtype=object) - - Export to a structured array, which can better-preserve individual - column data, such as name and dtype... - - >>> df.to_numpy(structured=True) - array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], - dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np - >>> df.to_numpy(structured=True).view(np.recarray) - rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], - dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: - ''' - Cast to a pandas DataFrame. 
- - This requires that :mod:`pandas` and :mod:`pyarrow` are installed. - This operation clones data, unless `use_pyarrow_extension_array=True`. - - Parameters - ---------- - use_pyarrow_extension_array - Use PyArrow backed-extension arrays instead of numpy arrays for each column - of the pandas DataFrame; this allows zero copy operations and preservation - of null values. Subsequent operations on the resulting pandas DataFrame may - trigger conversion to NumPy arrays if that operation is not supported by - pyarrow compute functions. - **kwargs - Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. - - Returns - ------- - :class:`pandas.DataFrame` - - Examples - -------- - >>> import pandas - >>> df1 = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> pandas_df1 = df1.to_pandas() - >>> type(pandas_df1) - <class \'pandas.core.frame.DataFrame\'> - >>> pandas_df1.dtypes - foo int64 - bar int64 - ham object - dtype: object - >>> df2 = pl.DataFrame( - ... { - ... "foo": [1, 2, None], - ... "bar": [6, None, 8], - ... "ham": [None, "b", "c"], - ... } - ... ) - >>> pandas_df2 = df2.to_pandas() - >>> pandas_df2 - foo bar ham - 0 1.0 6.0 None - 1 2.0 NaN b - 2 NaN 8.0 c - >>> pandas_df2.dtypes - foo float64 - bar float64 - ham object - dtype: object - >>> pandas_df2_pa = df2.to_pandas( - ... use_pyarrow_extension_array=True - ... ) # doctest: +SKIP - >>> pandas_df2_pa # doctest: +SKIP - foo bar ham - 0 1 6 <NA> - 1 2 <NA> b - 2 <NA> 8 c - >>> pandas_df2_pa.dtypes # doctest: +SKIP - foo int64[pyarrow] - bar int64[pyarrow] - ham large_string[pyarrow] - dtype: object - - ''' - def to_series(self, index: int = ...) -> Series: - ''' - Select column as Series at index location. - - Parameters - ---------- - index - Location of selection. - - See Also - -------- - get_column - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.to_series(1) - shape: (3,) - Series: \'bar\' [i64] - [ - 6 - 7 - 8 - ] - - ''' - def to_init_repr(self, n: int = ...) -> str: - ''' - Convert DataFrame to instantiatable string representation. - - Parameters - ---------- - n - Only use first n rows. - - See Also - -------- - polars.Series.to_init_repr - polars.from_repr - - Examples - -------- - >>> df = pl.DataFrame( - ... [ - ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), - ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), - ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), - ... ] - ... ) - >>> print(df.to_init_repr()) - pl.DataFrame( - [ - pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), - pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), - pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), - ] - ) - - >>> df_from_str_repr = eval(df.to_init_repr()) - >>> df_from_str_repr - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ f32 ┆ cat │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 7.0 ┆ b │ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: - ''' - Serialize to JSON representation. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - If set to `None` (default), the output is returned as a string instead. - pretty - Pretty serialize json. - row_oriented - Write to row oriented json. This is slower, but more common. 
- - See Also - -------- - DataFrame.write_ndjson - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... } - ... ) - >>> df.write_json() - \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' - >>> df.write_json(row_oriented=True) - \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' - - ''' - def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: - ''' - Serialize to newline delimited JSON representation. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - If set to `None` (default), the output is returned as a string instead. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... } - ... ) - >>> df.write_ndjson() - \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' - - ''' - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: - ''' - Write to comma-separated values (CSV) file. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - If set to `None` (default), the output is returned as a string instead. - include_bom - Whether to include UTF-8 BOM in the CSV output. - include_header - Whether to include header in the CSV output. - separator - Separate CSV fields with this symbol. - line_terminator - String used to end each row. - quote_char - Byte to use as quoting character. - batch_size - Number of rows that will be processed per thread. - datetime_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. If no format specified, the default fractional-second - precision is inferred from the maximum timeunit found in the frame\'s - Datetime cols (if any). - date_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. - time_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. - float_precision - Number of decimal places to write, applied to both `Float32` and - `Float64` datatypes. - null_value - A string representing null values (defaulting to the empty string). - quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} - Determines the quoting strategy used. - - - necessary (default): This puts quotes around fields only when necessary. - They are necessary when fields contain a quote, - separator or record terminator. - Quotes are also necessary when writing an empty record - (which is indistinguishable from a record with one empty field). - This is the default. - - always: This puts quotes around every field. Always. - - never: This never puts quotes around fields, even if that results in - invalid CSV data (e.g.: by not quoting strings containing the separator). - - non_numeric: This puts quotes around all fields that are non-numeric. - Namely, when writing a field that does not parse as a valid float - or integer, then quotes will be used even if they aren`t strictly - necessary. - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... 
) - >>> path: pathlib.Path = dirpath / "new_file.csv" - >>> df.write_csv(path, separator=",") - - ''' - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: - ''' - Write to Apache Avro file. - - Parameters - ---------- - file - File path or writeable file-like object to which the data will be written. - compression : {\'uncompressed\', \'snappy\', \'deflate\'} - Compression method. Defaults to "uncompressed". - name - Schema name. Defaults to empty string. - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.avro" - >>> df.write_avro(path) - - ''' - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: - ''' - Write frame data to a table in an Excel workbook/worksheet. - - Parameters - ---------- - workbook : Workbook - String name or path of the workbook to create, BytesIO object to write - into, or an open `xlsxwriter.Workbook` object that has not been closed. - If None, writes to a `dataframe.xlsx` workbook in the working directory. - worksheet : str - Name of target worksheet; if None, writes to "Sheet1" when creating a new - workbook (note that writing to an existing workbook requires a valid - existing -or new- worksheet name). - position : {str, tuple} - Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. - table_style : {str, dict} - A named Excel table style, such as "Table Style Medium 4", or a dictionary - of `{"key":value,}` options containing one or more of the following keys: - "style", "first_column", "last_column", "banded_columns, "banded_rows". - table_name : str - Name of the output table object in the worksheet; can then be referred to - in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. - column_formats : dict - A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an - Excel format string to the given columns. Formats defined here (such as - "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. - dtype_formats : dict - A `{dtype:str,}` dictionary that sets the default Excel format for the - given dtype. (This can be overridden on a per-column basis by the - `column_formats` param). It is also valid to use dtype groups such as - `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform - integer and float formats. - conditional_formats : dict - A dictionary of colname (or selector) keys to a format str, dict, or list - that defines conditional formatting options for the specified columns. - - * If supplying a string typename, should be one of the valid `xlsxwriter` - types such as "3_color_scale", "data_bar", etc. - * If supplying a dictionary you can make use of any/all `xlsxwriter` - supported options, including icon sets, formulae, etc. - * Supplying multiple columns as a tuple/key will apply a single format - across all columns - this is effective in creating a heatmap, as the - min/max values will be determined across the entire range, not per-column. - * Finally, you can also supply a list made up from the above options - in order to apply *more* than one conditional format to the same range. 
- header_format : dict - A `{key:value,}` dictionary of `xlsxwriter` format options to apply - to the table header row, such as `{"bold":True, "font_color":"#702963"}`. - column_totals : {bool, list, dict} - Add a column-total row to the exported table. - - * If True, all numeric columns will have an associated total using "sum". - * If passing a string, it must be one of the valid total function names - and all numeric columns will have an associated total using that function. - * If passing a list of colnames, only those given will have a total. - * For more control, pass a `{colname:funcname,}` dict. - - Valid total function names are "average", "count_nums", "count", "max", - "min", "std_dev", "sum", and "var". - column_widths : {dict, int} - A `{colname:int,}` or `{selector:int,}` dict or a single integer that - sets (or overrides if autofitting) table column widths, in integer pixel - units. If given as an integer the same value is used for all table columns. - row_totals : {dict, bool} - Add a row-total column to the right-hand side of the exported table. - - * If True, a column called "total" will be added at the end of the table - that applies a "sum" function row-wise across all numeric columns. - * If passing a list/sequence of column names, only the matching columns - will participate in the sum. - * Can also pass a `{colname:columns,}` dictionary to create one or - more total columns with distinct names, referencing different columns. - row_heights : {dict, int} - An int or `{row_index:int,}` dictionary that sets the height of the given - rows (if providing a dictionary) or all rows (if providing an integer) that - intersect with the table body (including any header and total row) in - integer pixel units. Note that `row_index` starts at zero and will be - the header row (unless `include_header` is False). - sparklines : dict - A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more - sparklines to be written into a new column in the table. - - * If passing a list of colnames (used as the source of the sparkline data) - the default sparkline settings are used (eg: line chart with no markers). - * For more control an `xlsxwriter`-compliant options dict can be supplied, - in which case three additional polars-specific keys are available: - "columns", "insert_before", and "insert_after". These allow you to define - the source columns and position the sparkline(s) with respect to other - table columns. If no position directive is given, sparklines are added to - the end of the table (eg: to the far right) in the order they are given. - formulas : dict - A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or - more formulas to be written into a new column in the table. Note that you - are strongly advised to use structured references in your formulae wherever - possible to make it simple to reference columns by name. - - * If providing a string formula (such as "=[@colx]*[@coly]") the column will - be added to the end of the table (eg: to the far right), after any default - sparklines and before any row_totals. - * For the most control supply an options dictionary with the following keys: - "formula" (mandatory), one of "insert_before" or "insert_after", and - optionally "return_dtype". The latter is used to appropriately format the - output of the formula and allow it to participate in row/column totals. 
- float_precision : int - Default number of decimals displayed for floating point columns (note that - this is purely a formatting directive; the actual values are not rounded). - include_header : bool - Indicate if the table should be created with a header row. - autofilter : bool - If the table has headers, provide autofilter capability. - autofit : bool - Calculate individual column widths from the data. - hidden_columns : list - A list or selector representing table columns to hide in the worksheet. - hide_gridlines : bool - Do not display any gridlines on the output worksheet. - sheet_zoom : int - Set the default zoom level of the output worksheet. - freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) - Freeze workbook panes. - - * If (row, col) is supplied, panes are split at the top-left corner of the - specified cell, which are 0-indexed. Thus, to freeze only the top row, - supply (1, 0). - * Alternatively, cell notation can be used to supply the cell. For example, - "A2" indicates the split occurs at the top-left of cell A2, which is the - equivalent of (1, 0). - * If (row, col, top_row, top_col) are supplied, the panes are split based on - the `row` and `col`, and the scrolling region is inititalized to begin at - the `top_row` and `top_col`. Thus, to freeze only the top row and have the - scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). - Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. - - Notes - ----- - * A list of compatible `xlsxwriter` format property names can be found here: - https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties - - * Conditional formatting dictionaries should provide xlsxwriter-compatible - definitions; polars will take care of how they are applied on the worksheet - with respect to the relative sheet/column position. For supported options, - see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html - - * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible - key/values, as well as a mandatory polars "columns" key that defines the - sparkline source data; these source columns should all be adjacent. Two other - polars-specific keys are available to help define where the sparkline appears - in the table: "insert_after", and "insert_before". The value associated with - these keys should be the name of a column in the exported table. - https://xlsxwriter.readthedocs.io/working_with_sparklines.html - - * Formula dictionaries *must* contain a key called "formula", and then optional - "insert_after", "insert_before", and/or "return_dtype" keys. These additional - keys allow the column to be injected into the table at a specific location, - and/or to define the return type of the formula (eg: "Int64", "Float64", etc). - Formulas that refer to table columns should use Excel\'s structured references - syntax to ensure the formula is applied correctly and is table-relative. - https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e - - Examples - -------- - Instantiate a basic DataFrame: - - >>> from random import uniform - >>> from datetime import date - >>> - >>> df = pl.DataFrame( - ... { - ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], - ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], - ... "val": [10_000, 20_000, 30_000], - ... } - ... 
) - - Export to "dataframe.xlsx" (the default workbook name, if not specified) in the - working directory, add column totals ("sum" by default) on all numeric columns, - then autofit: - - >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP - - Write frame to a specific location on the sheet, set a named table style, - apply US-style date formatting, increase default float precision, apply a - non-default total function to a single column, autofit: - - >>> df.write_excel( # doctest: +SKIP - ... position="B4", - ... table_style="Table Style Light 16", - ... dtype_formats={pl.Date: "mm/dd/yyyy"}, - ... column_totals={"num": "average"}, - ... float_precision=6, - ... autofit=True, - ... ) - - Write the same frame to a named worksheet twice, applying different styles - and conditional formatting to each table, adding table titles using explicit - xlsxwriter integration: - - >>> from xlsxwriter import Workbook - >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP - ... # basic/default conditional formatting - ... df.write_excel( - ... workbook=wb, - ... worksheet="data", - ... position=(3, 1), # specify position as (row,col) coordinates - ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, - ... table_style="Table Style Medium 4", - ... ) - ... - ... # advanced conditional formatting, custom styles - ... df.write_excel( - ... workbook=wb, - ... worksheet="data", - ... position=(len(df) + 7, 1), - ... table_style={ - ... "style": "Table Style Light 4", - ... "first_column": True, - ... }, - ... conditional_formats={ - ... "num": { - ... "type": "3_color_scale", - ... "min_color": "#76933c", - ... "mid_color": "#c4d79b", - ... "max_color": "#ebf1de", - ... }, - ... "val": { - ... "type": "data_bar", - ... "data_bar_2010": True, - ... "bar_color": "#9bbb59", - ... "bar_negative_color_same": True, - ... "bar_negative_border_color_same": True, - ... }, - ... }, - ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, - ... column_widths={"val": 125}, - ... autofit=True, - ... ) - ... - ... # add some table titles (with a custom format) - ... ws = wb.get_worksheet_by_name("data") - ... fmt_title = wb.add_format( - ... { - ... "font_color": "#4f6228", - ... "font_size": 12, - ... "italic": True, - ... "bold": True, - ... } - ... ) - ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) - ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) - ... - - Export a table containing two different types of sparklines. Use default - options for the "trend" sparkline and customised options (and positioning) - for the "+/-" win_loss sparkline, with non-default integer dtype formatting, - column totals, a subtle two-tone heatmap and hidden worksheet gridlines: - - >>> df = pl.DataFrame( - ... { - ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], - ... "q1": [100, 55, -20, 0, 35], - ... "q2": [30, -10, 15, 60, 20], - ... "q3": [-50, 0, 40, 80, 80], - ... "q4": [75, 55, 25, -10, -55], - ... } - ... ) - >>> df.write_excel( # doctest: +SKIP - ... table_style="Table Style Light 2", - ... # apply accounting format to all flavours of integer - ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, - ... sparklines={ - ... # default options; just provide source cols - ... "trend": ["q1", "q2", "q3", "q4"], - ... # customised sparkline type, with positioning directive - ... "+/-": { - ... "columns": ["q1", "q2", "q3", "q4"], - ... "insert_after": "id", - ... "type": "win_loss", - ... }, - ... }, - ... conditional_formats={ - ... 
# create a unified multi-column heatmap - ... ("q1", "q2", "q3", "q4"): { - ... "type": "2_color_scale", - ... "min_color": "#95b3d7", - ... "max_color": "#ffffff", - ... }, - ... }, - ... column_totals=["q1", "q2", "q3", "q4"], - ... row_totals=True, - ... hide_gridlines=True, - ... ) - - Export a table containing an Excel formula-based column that calculates a - standardised Z-score, showing use of structured references in conjunction - with positioning directives, column totals, and custom formatting. - - >>> df = pl.DataFrame( - ... { - ... "id": ["a123", "b345", "c567", "d789", "e101"], - ... "points": [99, 45, 50, 85, 35], - ... } - ... ) - >>> df.write_excel( # doctest: +SKIP - ... table_style={ - ... "style": "Table Style Medium 15", - ... "first_column": True, - ... }, - ... column_formats={ - ... "id": {"font": "Consolas"}, - ... "points": {"align": "center"}, - ... "z-score": {"align": "center"}, - ... }, - ... column_totals="average", - ... formulas={ - ... "z-score": { - ... # use structured references to refer to the table columns and \'totals\' row - ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", - ... "insert_after": "points", - ... "return_dtype": pl.Float64, - ... } - ... }, - ... hide_gridlines=True, - ... sheet_zoom=125, - ... ) - - ''' - def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: - ''' - Write to Arrow IPC binary stream or Feather file. - - See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - file - Path or writeable file-like object to which the IPC data will be - written. If set to `None`, the output is returned as a BytesIO object. - compression : {\'uncompressed\', \'lz4\', \'zstd\'} - Compression method. Defaults to "uncompressed". - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.arrow" - >>> df.write_ipc(path) - - ''' - def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: - ''' - Write to Arrow IPC record batch stream. - - See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - file - Path or writeable file-like object to which the IPC record batch data will - be written. If set to `None`, the output is returned as a BytesIO object. - compression : {\'uncompressed\', \'lz4\', \'zstd\'} - Compression method. Defaults to "uncompressed". - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.arrow" - >>> df.write_ipc_stream(path) - - ''' - def write_parquet(self, file: str | Path | BytesIO) -> None: - ''' - Write to Apache Parquet file. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. - Choose "snappy" for more backwards compatibility guarantees - when you deal with older parquet readers. 
- compression_level - The level of compression to use. Higher compression means smaller files on - disk. - - - "gzip" : min-level: 0, max-level: 10. - - "brotli" : min-level: 0, max-level: 11. - - "zstd" : min-level: 1, max-level: 22. - - statistics - Write statistics to the parquet headers. This requires extra compute. - row_group_size - Size of the row groups in number of rows. Defaults to 512^2 rows. - use_pyarrow - Use C++ parquet implementation vs Rust parquet implementation. - At the moment C++ supports more features. - pyarrow_options - Arguments passed to `pyarrow.parquet.write_table`. - - If you pass `partition_cols` here, the dataset will be written - using `pyarrow.parquet.write_to_dataset`. - The `partition_cols` parameter leads to write the dataset to a directory. - Similar to Spark\'s partitioned datasets. - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.parquet" - >>> df.write_parquet(path) - - We can use pyarrow with use_pyarrow_write_to_dataset=True - to write partitioned datasets. The following example will - write the first row to ../watermark=1/*.parquet and the - other rows to ../watermark=2/*.parquet. - - >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) - >>> path: pathlib.Path = dirpath / "partitioned_object" - >>> df.write_parquet( - ... path, - ... use_pyarrow=True, - ... pyarrow_options={"partition_cols": ["watermark"]}, - ... ) - - ''' - def write_database(self, table_name: str, connection: str) -> None: - ''' - Write a polars frame to a database. - - Parameters - ---------- - table_name - Name of the table to create or append to in the target SQL database. - If your table name contains special characters, it should be quoted. - connection - Connection URI string, for example: - - * "postgresql://user:pass@server:port/database" - * "sqlite:////path/to/database.db" - if_exists : {\'append\', \'replace\', \'fail\'} - The insert mode. - \'replace\' will create a new database table, overwriting an existing one. - \'append\' will append to an existing table. - \'fail\' will fail if table already exists. - engine : {\'sqlalchemy\', \'adbc\'} - Select the engine used for writing the data. - ''' - def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: - ''' - Write DataFrame as delta table. - - Parameters - ---------- - target - URI of a table or a DeltaTable object. - mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} - How to handle existing data. - - * If \'error\', throw an error if the table already exists (default). - * If \'append\', will add new data. - * If \'overwrite\', will replace table with new data. - * If \'ignore\', will not write anything if table already exists. - overwrite_schema - If True, allows updating the schema of the table. - storage_options - Extra options for the storage backends supported by `deltalake`. - For cloud storages, this may include configurations for authentication etc. - - * See a list of supported storage options for S3 `here `__. - * See a list of supported storage options for GCS `here `__. - * See a list of supported storage options for Azure `here `__. - delta_write_options - Additional keyword arguments while writing a Delta lake Table. - See a list of supported write options `here `__. - - Raises - ------ - TypeError - If the DataFrame contains unsupported data types. 
- ArrowInvalidError - If the DataFrame contains data types that could not be cast to their - primitive type. - - Notes - ----- - The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` - are not supported by the delta protocol specification and will raise a - TypeError. - - Some other data types are not supported but have an associated `primitive type - `__ - to which they can be cast. This affects the following data types: - - - Unsigned integers - - :class:`Datetime` types with millisecond or nanosecond precision or with - time zone information - - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) - - Polars columns are always nullable. To write data to a delta table with - non-nullable columns, a custom pyarrow schema has to be passed to the - `delta_write_options`. See the last example below. - - Examples - -------- - Write a dataframe to the local filesystem as a Delta Lake table. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> table_path = "/path/to/delta-table/" - >>> df.write_delta(table_path) # doctest: +SKIP - - Append data to an existing Delta Lake table on the local filesystem. - Note that this will fail if the schema of the new data does not match the - schema of the existing table. - - >>> df.write_delta(table_path, mode="append") # doctest: +SKIP - - Overwrite a Delta Lake table as a new version. - If the schemas of the new and old data are the same, setting - `overwrite_schema` is not required. - - >>> existing_table_path = "/path/to/delta-table/" - >>> df.write_delta( - ... existing_table_path, mode="overwrite", overwrite_schema=True - ... ) # doctest: +SKIP - - Write a dataframe as a Delta Lake table to a cloud object store like S3. - - >>> table_path = "s3://bucket/prefix/to/delta-table/" - >>> df.write_delta( - ... table_path, - ... storage_options={ - ... "AWS_REGION": "THE_AWS_REGION", - ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", - ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", - ... }, - ... ) # doctest: +SKIP - - Write DataFrame as a Delta Lake table with non-nullable columns. - - >>> import pyarrow as pa - >>> existing_table_path = "/path/to/delta-table/" - >>> df.write_delta( - ... existing_table_path, - ... delta_write_options={ - ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) - ... }, - ... ) # doctest: +SKIP - - ''' - def estimated_size(self, unit: SizeUnit = ...) -> int | float: - ''' - Return an estimation of the total (heap) allocated size of the `DataFrame`. - - Estimated size is given in the specified unit (bytes by default). - - This estimation is the sum of the size of its buffers, validity, including - nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the - size of 2 arrays is not the sum of the sizes computed from this function. In - particular, [`StructArray`]\'s size is an upper bound. - - When an array is sliced, its allocated size remains constant because the buffer - unchanged. However, this function will yield a smaller number. This is because - this function returns the visible size of the buffer, not its total capacity. - - FFI buffers are included in this estimation. - - Parameters - ---------- - unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} - Scale the returned size to the given unit. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "x": list(reversed(range(1_000_000))), - ... 
"y": [v / 1000 for v in range(1_000_000)], - ... "z": [str(v) for v in range(1_000_000)], - ... }, - ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], - ... ) - >>> df.estimated_size() - 25888898 - >>> df.estimated_size("mb") - 24.689577102661133 - - ''' - def transpose(self) -> Self: - ''' - Transpose a DataFrame over the diagonal. - - Parameters - ---------- - include_header - If set, the column names will be added as first column. - header_name - If `include_header` is set, this determines the name of the column that will - be inserted. - column_names - Optional iterable yielding strings or a string naming an existing column. - These will name the value (non-header) columns in the transposed data. - - Notes - ----- - This is a very expensive operation. Perhaps you can do it differently. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) - >>> df.transpose(include_header=True) - shape: (2, 4) - ┌────────┬──────────┬──────────┬──────────┐ - │ column ┆ column_0 ┆ column_1 ┆ column_2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞════════╪══════════╪══════════╪══════════╡ - │ a ┆ 1 ┆ 2 ┆ 3 │ - │ b ┆ 1 ┆ 2 ┆ 3 │ - └────────┴──────────┴──────────┴──────────┘ - - Replace the auto-generated column names with a list - - >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 2 ┆ 3 │ - │ 1 ┆ 2 ┆ 3 │ - └─────┴─────┴─────┘ - - Include the header as a separate column - - >>> df.transpose( - ... include_header=True, header_name="foo", column_names=["a", "b", "c"] - ... ) - shape: (2, 4) - ┌─────┬─────┬─────┬─────┐ - │ foo ┆ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═════╡ - │ a ┆ 1 ┆ 2 ┆ 3 │ - │ b ┆ 1 ┆ 2 ┆ 3 │ - └─────┴─────┴─────┴─────┘ - - Replace the auto-generated column with column names from a generator function - - >>> def name_generator(): - ... base_name = "my_column_" - ... count = 0 - ... while True: - ... yield f"{base_name}{count}" - ... count += 1 - ... - >>> df.transpose(include_header=False, column_names=name_generator()) - shape: (2, 3) - ┌─────────────┬─────────────┬─────────────┐ - │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════════════╪═════════════╪═════════════╡ - │ 1 ┆ 2 ┆ 3 │ - │ 1 ┆ 2 ┆ 3 │ - └─────────────┴─────────────┴─────────────┘ - - Use an existing column as the new column names - - >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) - >>> df.transpose(column_names="id") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 3 ┆ 2 │ - │ 3 ┆ 4 ┆ 6 │ - └─────┴─────┴─────┘ - >>> df.transpose(include_header=True, header_name="new_id", column_names="id") - shape: (2, 4) - ┌────────┬─────┬─────┬─────┐ - │ new_id ┆ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╪═════╡ - │ col1 ┆ 1 ┆ 3 ┆ 2 │ - │ col2 ┆ 3 ┆ 4 ┆ 6 │ - └────────┴─────┴─────┴─────┘ - ''' - def reverse(self) -> DataFrame: - ''' - Reverse the DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "key": ["a", "b", "c"], - ... "val": [1, 2, 3], - ... } - ... 
) - >>> df.reverse() - shape: (3, 2) - ┌─────┬─────┐ - │ key ┆ val │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ c ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 1 │ - └─────┴─────┘ - - ''' - def rename(self, mapping: dict[str, str]) -> DataFrame: - ''' - Rename column names. - - Parameters - ---------- - mapping - Key value pairs that map from old name to new name. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} - ... ) - >>> df.rename({"foo": "apple"}) - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - - ''' - def insert_column(self, index: int, column: Series) -> Self: - ''' - Insert a Series at a certain column index. - - This operation is in place. - - Parameters - ---------- - index - Index at which to insert the new `Series` column. - column - `Series` to insert. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> s = pl.Series("baz", [97, 98, 99]) - >>> df.insert_column(1, s) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ baz ┆ bar │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 97 ┆ 4 │ - │ 2 ┆ 98 ┆ 5 │ - │ 3 ┆ 99 ┆ 6 │ - └─────┴─────┴─────┘ - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) - >>> df.insert_column(3, s) - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ d │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ - │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ - │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ - └─────┴──────┴───────┴──────┘ - - ''' - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: - ''' - Filter the rows in the DataFrame based on a predicate expression. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - predicates - Expression that evaluates to a boolean Series. - constraints - Column filters. Use name=value to filter column name by the supplied value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - - Filter on one condition: - - >>> df.filter(pl.col("foo") > 1) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Filter on multiple conditions, combined with and/or operators: - - >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Provide multiple filters using `*args` syntax: - - >>> df.filter( - ... pl.col("foo") <= 2, - ... ~pl.col("ham").is_in(["b", "c"]), - ... 
) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `**kwargs` syntax: - - >>> df.filter(foo=2, ham="b") - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def glimpse(self) -> str | None: - ''' - Return a dense preview of the DataFrame. - - The formatting shows one line per column so that wide dataframes display - cleanly. Each line shows the column name, the data type, and the first - few values. - - Parameters - ---------- - max_items_per_column - Maximum number of items to show per column. - max_colname_length - Maximum length of the displayed column names; values that exceed this - value are truncated with a trailing ellipsis. - return_as_string - If True, return the preview as a string instead of printing to stdout. - - See Also - -------- - describe, head, tail - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, 2.8, 3.0], - ... "b": [4, 5, None], - ... "c": [True, False, True], - ... "d": [None, "b", "c"], - ... "e": ["usd", "eur", None], - ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], - ... } - ... ) - >>> df.glimpse() - Rows: 3 - Columns: 6 - $ a <f64> 1.0, 2.8, 3.0 - $ b <i64> 4, 5, None - $ c <bool> True, False, True - $ d <str> None, \'b\', \'c\' - $ e <str> \'usd\', \'eur\', None - $ f <date> 2020-01-01, 2021-01-02, 2022-01-01 - - ''' - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: - ''' - Summary statistics for a DataFrame. - - Parameters - ---------- - percentiles - One or more percentiles to include in the summary statistics. - All values must be in the range `[0, 1]`. - - Notes - ----- - The median is included by default as the 50% percentile. - - See Also - -------- - glimpse - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, 2.8, 3.0], - ... "b": [4, 5, None], - ... "c": [True, False, True], - ... "d": [None, "b", "c"], - ... "e": ["usd", "eur", None], - ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], - ... } - ... ) - >>> df.describe() - shape: (9, 7) - ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ - │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ - ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ - │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ - │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ - │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ - │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ - │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ - │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ - │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ - │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ - │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ - └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ - - ''' - def get_column_index(self, name: str) -> int: - ''' - Find the index of a column by name. - - Parameters - ---------- - name - Name of the column to find. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} - ...
) - >>> df.get_column_index("ham") - 2 - - ''' - def replace_column(self, index: int, column: Series) -> Self: - ''' - Replace a column at an index location. - - This operation is in place. - - Parameters - ---------- - index - Column index. - column - Series that will replace the column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> s = pl.Series("apple", [10, 20, 30]) - >>> df.replace_column(0, s) - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 10 ┆ 6 ┆ a │ - │ 20 ┆ 7 ┆ b │ - │ 30 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - ''' - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: - ''' - Sort the dataframe by the given columns. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. - *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [6.0, 5.0, 4.0], - ... "c": ["a", "c", "b"], - ... } - ... ) - >>> df.sort("a") - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - Sorting by expressions is also supported. - - >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - └──────┴─────┴─────┘ - - Sort by multiple columns by passing a list of columns. - - >>> df.sort(["c", "a"], descending=True) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - └──────┴─────┴─────┘ - - Or use positional arguments to sort by multiple columns in the same way. - - >>> df.sort("c", "a", descending=[False, True]) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - ''' - def top_k(self, k: int) -> DataFrame: - ''' - Return the `k` largest elements. - - If \'descending=True` the smallest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - bottom_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 largest values in column b. 
- - >>> df.top_k(4, by="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ a ┆ 2 │ - │ b ┆ 2 │ - │ b ┆ 1 │ - └─────┴─────┘ - - Get the rows which contain the 4 largest values when sorting on column b and a. - - >>> df.top_k(4, by=["b", "a"]) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 2 │ - │ c ┆ 1 │ - └─────┴─────┘ - - ''' - def bottom_k(self, k: int) -> DataFrame: - ''' - Return the `k` smallest elements. - - If \'descending=True` the largest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - top_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 smallest values in column b. - - >>> df.bottom_k(4, by="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 1 │ - │ a ┆ 1 │ - │ c ┆ 1 │ - │ a ┆ 2 │ - └─────┴─────┘ - - Get the rows which contain the 4 smallest values when sorting on column a and b. - - >>> df.bottom_k(4, by=["a", "b"]) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ b ┆ 1 │ - │ b ┆ 2 │ - └─────┴─────┘ - - ''' - def equals(self, other: DataFrame) -> bool: - ''' - Check whether the DataFrame is equal to another DataFrame. - - Parameters - ---------- - other - DataFrame to compare with. - null_equal - Consider null values as equal. - - See Also - -------- - assert_frame_equal - - Examples - -------- - >>> df1 = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df2 = pl.DataFrame( - ... { - ... "foo": [3, 2, 1], - ... "bar": [8.0, 7.0, 6.0], - ... "ham": ["c", "b", "a"], - ... } - ... ) - >>> df1.equals(df1) - True - >>> df1.equals(df2) - False - - ''' - def replace(self, column: str, new_column: Series) -> Self: - ''' - Replace a column by a new Series. - - Parameters - ---------- - column - Column to replace. - new_column - New column to insert. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> s = pl.Series([10, 20, 30]) - >>> df.replace("foo", s) # works in-place! # doctest: +SKIP - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 10 ┆ 4 │ - │ 20 ┆ 5 │ - │ 30 ┆ 6 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int, length: int | None = ...) -> Self: - ''' - Get a slice of this DataFrame. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.slice(1, 2) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7.0 ┆ b │ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def head(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the last `abs(n)`. - - See Also - -------- - tail, glimpse, slice - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> df.head(3) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Pass a negative value to get all rows `except` the last `abs(n)`. - - >>> df.head(-3) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def tail(self, n: int = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the first `abs(n)`. - - See Also - -------- - head, slice - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> df.tail(3) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - │ 4 ┆ 9 ┆ d │ - │ 5 ┆ 10 ┆ e │ - └─────┴─────┴─────┘ - - Pass a negative value to get all rows `except` the first `abs(n)`. - - >>> df.tail(-3) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 4 ┆ 9 ┆ d │ - │ 5 ┆ 10 ┆ e │ - └─────┴─────┴─────┘ - - ''' - def limit(self, n: int = ...) -> Self: - """ - Get the first `n` rows. - - Alias for :func:`DataFrame.head`. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the last `abs(n)`. - - See Also - -------- - head - - """ - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: - ''' - Drop all rows that contain null values. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - subset - Column name(s) for which null values are considered. - If set to `None` (default), use all columns. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, None, 8], - ... "ham": ["a", "b", None], - ... } - ... ) - - The default behavior of this method is to drop rows where any single - value of the row is null. - - >>> df.drop_nulls() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - This behaviour can be constrained to consider only a subset of columns, as - defined by name or with a selector. 
For example, dropping rows if there is - a null in any of the integer columns: - - >>> import polars.selectors as cs - >>> df.drop_nulls(subset=cs.integer()) - shape: (2, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ null │ - └─────┴─────┴──────┘ - - Below are some additional examples that show how to drop null - values based on other conditions. - - >>> df = pl.DataFrame( - ... { - ... "a": [None, None, None, None], - ... "b": [1, 2, None, 1], - ... "c": [1, None, None, 1], - ... } - ... ) - >>> df - shape: (4, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪══════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ null ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴──────┴──────┘ - - Drop a row only if all values are null: - - >>> df.filter(~pl.all_horizontal(pl.all().is_null())) - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴─────┴──────┘ - - Drop a column if all values are null: - - >>> df[[s.name for s in df if not (s.null_count() == df.height)]] - shape: (4, 2) - ┌──────┬──────┐ - │ b ┆ c │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 1 ┆ 1 │ - │ 2 ┆ null │ - │ null ┆ null │ - │ 1 ┆ 1 │ - └──────┴──────┘ - - ''' - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the frame as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Notes - ----- - It is recommended to use LazyFrame when piping operations, in order - to fully take advantage of query optimization and parallelization. - See :meth:`df.lazy() `. - - Examples - -------- - >>> def cast_str_to_int(data, col_name): - ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) - ... - >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) - >>> df.pipe(cast_str_to_int, col_name="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 30 │ - │ 4 ┆ 40 │ - └─────┴─────┘ - - >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) - >>> df - shape: (2, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - └─────┴─────┘ - >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 1 │ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: - ''' - Add a column at index 0 that counts the rows. - - Parameters - ---------- - name - Name of the column to add. - offset - Start the row count at this offset. Default = 0 - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... 
) - >>> df.with_row_count() - shape: (3, 3) - ┌────────┬─────┬─────┐ - │ row_nr ┆ a ┆ b │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╡ - │ 0 ┆ 1 ┆ 2 │ - │ 1 ┆ 3 ┆ 4 │ - │ 2 ┆ 5 ┆ 6 │ - └────────┴─────┴─────┘ - - ''' - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: - ''' - Start a group by operation. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - .. note:: - Within each group, the order of rows is always preserved, regardless - of this argument. - - Returns - ------- - GroupBy - Object which can be used to perform aggregations. - - Examples - -------- - Group by one column and call `agg` to compute the grouped sum of another - column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... ) - >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 2 │ - │ b ┆ 5 │ - │ c ┆ 3 │ - └─────┴─────┘ - - Set `maintain_order=True` to ensure the order of the groups is consistent with - the input. - - >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) - shape: (3, 2) - ┌─────┬───────────┐ - │ a ┆ c │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [5, 3] │ - │ b ┆ [4, 2] │ - │ c ┆ [1] │ - └─────┴───────────┘ - - Group by multiple columns by passing a list of column names. - - >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - Or use positional arguments to group by multiple columns in the same way. - Expressions are also accepted. - - >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ f64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 0 ┆ 4.0 │ - │ b ┆ 1 ┆ 3.0 │ - │ c ┆ 1 ┆ 1.0 │ - └─────┴─────┴─────┘ - - The `GroupBy` object returned by this method is iterable, returning the name - and data of each group. - - >>> for name, data in df.group_by("a"): # doctest: +SKIP - ... print(name) - ... print(data) - ... - a - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘ - b - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘ - c - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def rolling(self, index_column: IntoExpr) -> RollingGroupBy: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - Different from a `group_by_dynamic` the windows are now determined by the - individual values and are not of constant intervals. 
For constant intervals use - :func:`DataFrame.group_by_dynamic`. - - If you have a time series ``<t_0, t_1, ..., t_n>``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... - * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... - * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - **"1i" # length 1** - - **"10i" # length 10** - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling operation on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - RollingGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - group_by_dynamic - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> out = df.rolling(index_column="dt", period="2d").agg( - ... [ - ... pl.sum("a").alias("sum_a"), - ... pl.min("a").alias("min_a"), - ... pl.max("a").alias("max_a"), - ... ] - ...
) - >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] - >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] - >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] - >>> out - shape: (6, 4) - ┌─────────────────────┬───────┬───────┬───────┐ - │ dt ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴───────┴───────┴───────┘ - - ''' - def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - Time windows are calculated and rows are assigned to windows. Different from a - normal group by is that a row can be member of multiple groups. - By default, the windows look like: - - - [start, start + period) - - [start + every, start + every + period) - - [start + 2*every, start + 2*every + period) - - ... - - where `start` is determined by `start_by`, `offset`, and `every` (see parameter - descriptions below). - - .. warning:: - The index column must be sorted in ascending order. If `by` is passed, then - the index column must be sorted in ascending order within each group. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - - .. deprecated:: 0.19.4 - Use `label` instead. - include_boundaries - Add the lower and upper bound of the window to the "_lower_boundary" and - "_upper_boundary" columns. This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - label : {\'left\', \'right\', \'datapoint\'} - Define which label to use for the window: - - - \'left\': lower boundary of the window - - \'right\': upper boundary of the window - - \'datapoint\': the first value of the index column in the given window. - If you don\'t need the label to be at one of the boundaries, choose this - option for maximum performance - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. 
- * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - DynamicGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - rolling - - Notes - ----- - 1) If you\'re coming from pandas, then - - .. code-block:: python - - # polars - df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) - - is equivalent to - - .. code-block:: python - - # pandas - df.set_index("ts").resample("D")["value"].sum().reset_index() - - though note that, unlike pandas, polars doesn\'t add extra rows for empty - windows. If you need `index_column` to be evenly spaced, then please combine - with :func:`DataFrame.upsample`. - - 2) The `every`, `period` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a group_by_dynamic on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Examples - -------- - >>> from datetime import datetime - >>> df = pl.DataFrame( - ... { - ... "time": pl.datetime_range( - ... start=datetime(2021, 12, 16), - ... end=datetime(2021, 12, 16, 3), - ... interval="30m", - ... eager=True, - ... ), - ... "n": range(7), - ... } - ... ) - >>> df - shape: (7, 2) - ┌─────────────────────┬─────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i64 │ - ╞═════════════════════╪═════╡ - │ 2021-12-16 00:00:00 ┆ 0 │ - │ 2021-12-16 00:30:00 ┆ 1 │ - │ 2021-12-16 01:00:00 ┆ 2 │ - │ 2021-12-16 01:30:00 ┆ 3 │ - │ 2021-12-16 02:00:00 ┆ 4 │ - │ 2021-12-16 02:30:00 ┆ 5 │ - │ 2021-12-16 03:00:00 ┆ 6 │ - └─────────────────────┴─────┘ - - Group by windows of 1 hour starting at 2021-12-16 00:00:00. - - >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [1, 2] │ - │ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ 2021-12-16 02:00:00 ┆ [5, 6] │ - └─────────────────────┴───────────┘ - - The window boundaries can also be added to the aggregation result - - >>> df.group_by_dynamic( - ... "time", every="1h", include_boundaries=True, closed="right" - ... 
).agg(pl.col("n").mean()) - shape: (4, 4) - ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ - │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ - ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ - │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ - │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ - │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ - │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ - └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ - - When closed="left", the window excludes the right end of interval: - [lower_bound, upper_bound) - - >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-16 00:00:00 ┆ [0, 1] │ - │ 2021-12-16 01:00:00 ┆ [2, 3] │ - │ 2021-12-16 02:00:00 ┆ [4, 5] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - When closed="both" the time values at the window boundaries belong to 2 groups. - - >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) - shape: (5, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ - │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - Dynamic group bys can also be combined with grouping on normal keys - - >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) - >>> df - shape: (7, 3) - ┌─────────────────────┬─────┬────────┐ - │ time ┆ n ┆ groups │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ str │ - ╞═════════════════════╪═════╪════════╡ - │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ - │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ - │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ - │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ - │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ - │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ - │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ - └─────────────────────┴─────┴────────┘ - >>> df.group_by_dynamic( - ... "time", - ... every="1h", - ... closed="both", - ... by="groups", - ... include_boundaries=True, - ... 
).agg(pl.col("n")) - shape: (7, 5) - ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ - │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ - ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ - │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ - │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ - │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ - │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ - │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ - └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ - - Dynamic group by on an index column - - >>> df = pl.DataFrame( - ... { - ... "idx": pl.int_range(0, 6, eager=True), - ... "A": ["A", "A", "B", "B", "B", "C"], - ... } - ... ) - >>> ( - ... df.group_by_dynamic( - ... "idx", - ... every="2i", - ... period="3i", - ... include_boundaries=True, - ... closed="right", - ... ).agg(pl.col("A").alias("A_agg_list")) - ... ) - shape: (4, 4) - ┌─────────────────┬─────────────────┬─────┬─────────────────┐ - │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 ┆ list[str] │ - ╞═════════════════╪═════════════════╪═════╪═════════════════╡ - │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ - │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ - │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ - │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ - └─────────────────┴─────────────────┴─────┴─────────────────┘ - - ''' - def upsample(self, time_column: str) -> Self: - ''' - Upsample a DataFrame at a regular frequency. - - The `every` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - - - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - Parameters - ---------- - time_column - time column will be used to determine a date_range. - Note that this column has to be sorted for the output to make sense. - every - interval will start \'every\' duration - offset - change the start of the date_range by this offset. - by - First group by these columns and then upsample for every group - maintain_order - Keep the ordering predictable. This is slower. - - Returns - ------- - DataFrame - Result will be sorted by `time_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - Examples - -------- - Upsample a DataFrame by a certain interval. - - >>> from datetime import datetime - >>> df = pl.DataFrame( - ... { - ... "time": [ - ... datetime(2021, 2, 1), - ... datetime(2021, 4, 1), - ... datetime(2021, 5, 1), - ... datetime(2021, 6, 1), - ... ], - ... "groups": ["A", "B", "A", "B"], - ... 
"values": [0, 1, 2, 3], - ... } - ... ).set_sorted("time") - >>> df.upsample( - ... time_column="time", every="1mo", by="groups", maintain_order=True - ... ).select(pl.all().forward_fill()) - shape: (7, 3) - ┌─────────────────────┬────────┬────────┐ - │ time ┆ groups ┆ values │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ str ┆ i64 │ - ╞═════════════════════╪════════╪════════╡ - │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ - │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ - │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ - │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ - └─────────────────────┴────────┴────────┘ - - ''' - def join_asof(self, other: DataFrame) -> DataFrame: - ''' - Perform an asof join. - - This is similar to a left-join except that we match on nearest key rather than - equal keys. - - Both DataFrames must be sorted by the asof_join key. - - For each row in the left DataFrame: - - - A "backward" search selects the last row in the right DataFrame whose - \'on\' key is less than or equal to the left\'s key. - - - A "forward" search selects the first row in the right DataFrame whose - \'on\' key is greater than or equal to the left\'s key. - - - A "nearest" search selects the last row in the right DataFrame whose value - is nearest to the left\'s key. String keys are not currently supported for a - nearest search. - - The default is "backward". - - Parameters - ---------- - other - Lazy DataFrame to join with. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - by - join on these columns before doing asof join - by_left - join on these columns before doing asof join - by_right - join on these columns before doing asof join - strategy : {\'backward\', \'forward\', \'nearest\'} - Join strategy. - suffix - Suffix to append to columns with a duplicate name. - tolerance - Numeric tolerance. By setting this the join will only be done if the near - keys are within this distance. If an asof join is done on columns of dtype - "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta - object or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. - - Examples - -------- - >>> from datetime import datetime - >>> gdp = pl.DataFrame( - ... { - ... "date": [ - ... datetime(2016, 1, 1), - ... datetime(2017, 1, 1), - ... datetime(2018, 1, 1), - ... datetime(2019, 1, 1), - ... ], # note record date: Jan 1st (sorted!) - ... "gdp": [4164, 4411, 4566, 4696], - ... } - ... ).set_sorted("date") - >>> population = pl.DataFrame( - ... { - ... "date": [ - ... datetime(2016, 5, 12), - ... datetime(2017, 5, 12), - ... 
datetime(2018, 5, 12), - ... datetime(2019, 5, 12), - ... ], # note record date: May 12th (sorted!) - ... "population": [82.19, 82.66, 83.12, 83.52], - ... } - ... ).set_sorted("date") - >>> population.join_asof(gdp, on="date", strategy="backward") - shape: (4, 3) - ┌─────────────────────┬────────────┬──────┐ - │ date ┆ population ┆ gdp │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ f64 ┆ i64 │ - ╞═════════════════════╪════════════╪══════╡ - │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ - │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ - │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ - │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ - └─────────────────────┴────────────┴──────┘ - - ''' - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: - ''' - Join in SQL-like fashion. - - Parameters - ---------- - other - DataFrame to join with. - on - Name(s) of the join columns in both DataFrames. - how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} - Join strategy. - - .. note:: - A left join preserves the row order of the left DataFrame. - left_on - Name(s) of the left join column(s). - right_on - Name(s) of the right join column(s). - suffix - Suffix to append to columns with a duplicate name. - validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} - Checks if join is of specified type. - - * *many_to_many* - “m:m”: default, does not result in checks - * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets - * *one_to_many* - “1:m”: check if join keys are unique in left dataset - * *many_to_one* - “m:1”: check if join keys are unique in right dataset - - .. note:: - - - This is currently not supported the streaming engine. - - This is only supported when joined by single columns. - - Returns - ------- - DataFrame - - See Also - -------- - join_asof - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> other_df = pl.DataFrame( - ... { - ... "apple": ["x", "y", "z"], - ... "ham": ["a", "b", "d"], - ... } - ... 
) - >>> df.join(other_df, on="ham") - shape: (2, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - └─────┴─────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="outer") - shape: (4, 4) - ┌──────┬──────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞══════╪══════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ null ┆ null ┆ d ┆ z │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └──────┴──────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="left") - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └─────┴─────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="semi") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 7.0 ┆ b │ - └─────┴─────┴─────┘ - - >>> df.join(other_df, on="ham", how="anti") - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - Notes - ----- - For joining on columns with categorical data, see `pl.StringCache()`. - - ''' - def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: - ''' - Apply a custom/user-defined function (UDF) over the rows of the DataFrame. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - The UDF will receive each row as a tuple of values: `udf(row)`. - - Implementing logic using a Python function is almost always *significantly* - slower and more memory intensive than implementing the same logic using - the native expression API because: - - - The native expression engine runs in Rust; UDFs run in Python. - - Use of Python UDFs forces the DataFrame to be materialized in memory. - - Polars-native expressions can be parallelised (UDFs typically cannot). - - Polars-native expressions can be logically optimised (UDFs cannot). - - Wherever possible you should strongly prefer the native expression API - to achieve the best performance. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output type of the operation. If none given, Polars tries to infer the type. - inference_size - Only used in the case when the custom function returns rows. - This uses the first `n` rows to determine the output schema. - - Notes - ----- - * The frame-level `apply` cannot track column names (as the UDF is a black-box - that may arbitrarily drop, rearrange, transform, or add new columns); if you - want to apply a UDF such that column names are preserved, you should use the - expression-level `apply` syntax instead. - - * If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. 
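A minimal sketch of the `@lru_cache` note above (illustrative only, not part of the generated stub; `expensive_row_fn` is a hypothetical name, and caching works because `map_rows` hands each row to the function as a hashable tuple):

>>> from functools import lru_cache
>>> @lru_cache(maxsize=None)
... def expensive_row_fn(row):
...     # stand-in for a costly per-row computation
...     return row[0] * 2 + row[1]
>>> pl.DataFrame({"foo": [1, 2, 1], "bar": [-1, 5, -1]}).map_rows(expensive_row_fn)  # doctest: +SKIP

The first and third rows are identical, so the third call is answered from the cache instead of re-running the function.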
- - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) - - Return a DataFrame by mapping each row to a tuple: - - >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) - shape: (3, 2) - ┌──────────┬──────────┐ - │ column_0 ┆ column_1 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════════╪══════════╡ - │ 2 ┆ -3 │ - │ 4 ┆ 15 │ - │ 6 ┆ 24 │ - └──────────┴──────────┘ - - However, it is much better to implement this with a native expression: - - >>> df.select( - ... pl.col("foo") * 2, - ... pl.col("bar") * 3, - ... ) # doctest: +IGNORE_RESULT - - Return a DataFrame with a single column by mapping each row to a scalar: - - >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP - shape: (3, 1) - ┌───────┐ - │ apply │ - │ --- │ - │ i64 │ - ╞═══════╡ - │ 1 │ - │ 9 │ - │ 14 │ - └───────┘ - - In this case it is better to use the following native expression: - - >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT - - ''' - def hstack(self, columns: list[Series] | DataFrame) -> Self: - ''' - Return a new DataFrame grown horizontally by stacking multiple Series to it. - - Parameters - ---------- - columns - Series to stack. - in_place - Modify in place. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> x = pl.Series("apple", [10, 20, 30]) - >>> df.hstack([x]) - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6 ┆ a ┆ 10 │ - │ 2 ┆ 7 ┆ b ┆ 20 │ - │ 3 ┆ 8 ┆ c ┆ 30 │ - └─────┴─────┴─────┴───────┘ - - ''' - def vstack(self, other: DataFrame) -> Self: - ''' - Grow this DataFrame vertically by stacking a DataFrame to it. - - Parameters - ---------- - other - DataFrame to stack. - in_place - Modify in place. - - See Also - -------- - extend - - Examples - -------- - >>> df1 = pl.DataFrame( - ... { - ... "foo": [1, 2], - ... "bar": [6, 7], - ... "ham": ["a", "b"], - ... } - ... ) - >>> df2 = pl.DataFrame( - ... { - ... "foo": [3, 4], - ... "bar": [8, 9], - ... "ham": ["c", "d"], - ... } - ... ) - >>> df1.vstack(df2) - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - │ 4 ┆ 9 ┆ d │ - └─────┴─────┴─────┘ - - ''' - def extend(self, other: DataFrame) -> Self: - ''' - Extend the memory backed by this `DataFrame` with the values from `other`. - - Different from `vstack` which adds the chunks from `other` to the chunks of - this `DataFrame`, `extend` appends the data from `other` to the underlying - memory locations and thus may cause a reallocation. - - If this does not cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `vstack` when you want to do a query after a single - append. For instance, during online operations where you add `n` rows and rerun - a query. - - Prefer `vstack` over `extend` when you want to append many times before - doing a query. For instance, when you read in multiple files and want to store - them in a single `DataFrame`. In the latter case, finish the sequence of - `vstack` operations with a `rechunk`. - - Parameters - ---------- - other - DataFrame to vertically add. - - Warnings - -------- - This method modifies the dataframe in-place. The dataframe is returned for - convenience only. 
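A small sketch of the in-place behaviour described in the warning above (illustrative only, not part of the generated stub; it assumes `extend` returns the frame it was called on, which is what "returned for convenience" refers to):

>>> df1 = pl.DataFrame({"foo": [1, 2]})
>>> df2 = pl.DataFrame({"foo": [3, 4]})
>>> out = df1.extend(df2)  # df1 itself is mutated
>>> out is df1  # doctest: +SKIP
True
>>> df1.shape  # doctest: +SKIP
(4, 1)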
- - See Also - -------- - vstack - - Examples - -------- - >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) - >>> df1.extend(df2) - shape: (6, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 5 │ - │ 3 ┆ 6 │ - │ 10 ┆ 40 │ - │ 20 ┆ 50 │ - │ 30 ┆ 60 │ - └─────┴─────┘ - - ''' - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: - ''' - Remove columns from the dataframe. - - Parameters - ---------- - columns - Names of the columns that should be removed from the dataframe, or - a selector that determines the columns to drop. - *more_columns - Additional columns to drop, specified as positional arguments. - - Examples - -------- - Drop a single column by passing the name of that column. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.drop("ham") - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 6.0 │ - │ 2 ┆ 7.0 │ - │ 3 ┆ 8.0 │ - └─────┴─────┘ - - Drop multiple columns by passing a list of column names. - - >>> df.drop(["bar", "ham"]) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Drop multiple columns by passing a selector. - - >>> import polars.selectors as cs - >>> df.drop(cs.numeric()) - shape: (3, 1) - ┌─────┐ - │ ham │ - │ --- │ - │ str │ - ╞═════╡ - │ a │ - │ b │ - │ c │ - └─────┘ - - Use positional arguments to drop multiple columns. - - >>> df.drop("foo", "ham") - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 6.0 │ - │ 7.0 │ - │ 8.0 │ - └─────┘ - - ''' - def drop_in_place(self, name: str) -> Series: - ''' - Drop a single column in-place and return the dropped column. - - Parameters - ---------- - name - Name of the column to drop. - - Returns - ------- - Series - The dropped column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.drop_in_place("ham") - shape: (3,) - Series: \'ham\' [str] - [ - "a" - "b" - "c" - ] - - ''' - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: - ''' - Cast DataFrame column(s) to the specified dtype(s). - - Parameters - ---------- - dtypes - Mapping of column names (or selector) to dtypes, or a single dtype - to which all columns will be cast. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], - ... } - ... 
) - - Cast specific frame columns to the specified dtypes: - - >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ u8 ┆ date │ - ╞═════╪═════╪════════════╡ - │ 1.0 ┆ 6 ┆ 2020-01-02 │ - │ 2.0 ┆ 7 ┆ 2021-03-04 │ - │ 3.0 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - Cast all frame columns to the specified dtype: - - >>> df.cast(pl.Utf8).to_dict(as_series=False) - {\'foo\': [\'1\', \'2\', \'3\'], - \'bar\': [\'6.0\', \'7.0\', \'8.0\'], - \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} - - Use selectors to define the columns being cast: - - >>> import polars.selectors as cs - >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ str │ - ╞═════╪═════╪════════════╡ - │ 1 ┆ 6 ┆ 2020-01-02 │ - │ 2 ┆ 7 ┆ 2021-03-04 │ - │ 3 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - ''' - def clear(self, n: int = ...) -> Self: - ''' - Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. - - Returns a `n`-row null-filled DataFrame with an identical schema. - `n` can be greater than the current number of rows in the DataFrame. - - Parameters - ---------- - n - Number of (null-filled) rows to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> df.clear() - shape: (0, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪═════╪══════╡ - └─────┴─────┴──────┘ - - >>> df.clear(n=2) - shape: (2, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪══════╪══════╡ - │ null ┆ null ┆ null │ - │ null ┆ null ┆ null │ - └──────┴──────┴──────┘ - - ''' - def clone(self) -> Self: - ''' - Create a copy of this DataFrame. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current DataFrame, with identical - schema but no data. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.clone() - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true │ - │ 2 ┆ 4.0 ┆ true │ - │ 3 ┆ 10.0 ┆ false │ - │ 4 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - ''' - def get_columns(self) -> list[Series]: - ''' - Get the DataFrame as a List of Series. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.get_columns() - [shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 3 - ], shape: (3,) - Series: \'bar\' [i64] - [ - 4 - 5 - 6 - ]] - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.get_columns() - [shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - ], shape: (4,) - Series: \'b\' [f64] - [ - 0.5 - 4.0 - 10.0 - 13.0 - ], shape: (4,) - Series: \'c\' [bool] - [ - true - true - false - true - ]] - - ''' - def get_column(self, name: str) -> Series: - ''' - Get a single column by name. - - Parameters - ---------- - name : str - Name of the column to retrieve. 
- - Returns - ------- - Series - - See Also - -------- - to_series - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.get_column("foo") - shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - matches_supertype - Fill all matching supertype of the fill `value`. - - Returns - ------- - DataFrame - DataFrame with None values replaced by the filling strategy. - - See Also - -------- - fill_nan - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 4], - ... "b": [0.5, 4, None, 13], - ... } - ... ) - >>> df.fill_null(99) - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 99 ┆ 99.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - >>> df.fill_null(strategy="forward") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> df.fill_null(strategy="max") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> df.fill_null(strategy="zero") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 0 ┆ 0.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - ''' - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: - ''' - Fill floating point NaN values by an Expression evaluation. - - Parameters - ---------- - value - Value with which to replace NaN values. - - Returns - ------- - DataFrame - DataFrame with NaN values replaced by the given value. - - Warnings - -------- - Note that floating point NaNs (Not a Number) are not missing values! - To replace missing values, use :func:`fill_null`. - - See Also - -------- - fill_null - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1.5, 2, float("nan"), 4], - ... "b": [0.5, 4, float("nan"), 13], - ... } - ... ) - >>> df.fill_nan(99) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════╡ - │ 1.5 ┆ 0.5 │ - │ 2.0 ┆ 4.0 │ - │ 99.0 ┆ 99.0 │ - │ 4.0 ┆ 13.0 │ - └──────┴──────┘ - - ''' - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: - ''' - Explode the dataframe to long format by exploding the given columns. - - Parameters - ---------- - columns - Column names, expressions, or a selector defining them. The underlying - columns being exploded must be of List or Utf8 datatype. - *more_columns - Additional names of columns to explode, specified as positional arguments. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "letters": ["a", "a", "b", "c"], - ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], - ... } - ... 
) - >>> df - shape: (4, 2) - ┌─────────┬───────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════════╪═══════════╡ - │ a ┆ [1] │ - │ a ┆ [2, 3] │ - │ b ┆ [4, 5] │ - │ c ┆ [6, 7, 8] │ - └─────────┴───────────┘ - >>> df.explode("numbers") - shape: (8, 2) - ┌─────────┬─────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════════╪═════════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ a ┆ 3 │ - │ b ┆ 4 │ - │ b ┆ 5 │ - │ c ┆ 6 │ - │ c ┆ 7 │ - │ c ┆ 8 │ - └─────────┴─────────┘ - - ''' - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: - ''' - Create a spreadsheet-style pivot table as a DataFrame. - - Only available in eager mode. See "Examples" section below for how to do a - "lazy pivot" if you know the unique column values in advance. - - Parameters - ---------- - values - Column values to aggregate. Can be multiple columns if the *columns* - arguments contains multiple columns as well. - index - One or multiple keys to group by. - columns - Name of the column(s) whose values will be used as the header of the output - DataFrame. - aggregate_function - Choose from: - - - None: no aggregation takes place, will raise error if multiple values are in group. - - A predefined aggregate function string, one of - {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} - - An expression to do the aggregation. - - maintain_order - Sort the grouped keys so that the output order is predictable. - sort_columns - Sort the transposed columns by name. Default is by order of discovery. - separator - Used as separator/delimiter in generated column names. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": ["one", "one", "two", "two", "one", "two"], - ... "bar": ["y", "y", "y", "x", "x", "x"], - ... "baz": [1, 2, 3, 4, 5, 6], - ... } - ... ) - >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ y ┆ x │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ one ┆ 3 ┆ 5 │ - │ two ┆ 3 ┆ 10 │ - └─────┴─────┴─────┘ - - Pivot using selectors to determine the index/values/columns: - - >>> import polars.selectors as cs - >>> df.pivot( - ... values=cs.numeric(), - ... index=cs.string(), - ... columns=cs.string(), - ... aggregate_function="sum", - ... sort_columns=True, - ... ).sort( - ... by=cs.string(), - ... ) - shape: (4, 6) - ┌─────┬─────┬──────┬──────┬──────┬──────┐ - │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪══════╪══════╪══════╪══════╡ - │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ - │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ - │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ - │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ - └─────┴─────┴──────┴──────┴──────┴──────┘ - - Run an expression as aggregation function - - >>> df = pl.DataFrame( - ... { - ... "col1": ["a", "a", "a", "b", "b", "b"], - ... "col2": ["x", "x", "x", "x", "y", "y"], - ... "col3": [6, 7, 3, 2, 5, 7], - ... } - ... ) - >>> df.pivot( - ... index="col1", - ... columns="col2", - ... values="col3", - ... aggregate_function=pl.element().tanh().mean(), - ... 
) - shape: (2, 3) - ┌──────┬──────────┬──────────┐ - │ col1 ┆ x ┆ y │ - │ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 │ - ╞══════╪══════════╪══════════╡ - │ a ┆ 0.998347 ┆ null │ - │ b ┆ 0.964028 ┆ 0.999954 │ - └──────┴──────────┴──────────┘ - - Note that `pivot` is only available in eager mode. If you know the unique - column values in advance, you can use :meth:`polars.LazyFrame.groupby` to - get the same result as above in lazy mode: - - >>> index = pl.col("col1") - >>> columns = pl.col("col2") - >>> values = pl.col("col3") - >>> unique_column_values = ["x", "y"] - >>> aggregate_function = lambda col: col.tanh().mean() - >>> ( - ... df.lazy() - ... .group_by(index) - ... .agg( - ... *[ - ... aggregate_function(values.filter(columns == value)).alias(value) - ... for value in unique_column_values - ... ] - ... ) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌──────┬──────────┬──────────┐ - │ col1 ┆ x ┆ y │ - │ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 │ - ╞══════╪══════════╪══════════╡ - │ a ┆ 0.998347 ┆ null │ - │ b ┆ 0.964028 ┆ 0.999954 │ - └──────┴──────────┴──────────┘ - - ''' - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: - ''' - Unpivot a DataFrame from wide to long format. - - Optionally leaves identifiers set. - - This function is useful to massage a DataFrame into a format where one or more - columns are identifier variables (id_vars) while all other columns, considered - measured variables (value_vars), are "unpivoted" to the row axis leaving just - two non-identifier columns, \'variable\' and \'value\'. - - Parameters - ---------- - id_vars - Column(s) or selector(s) to use as identifier variables. - value_vars - Column(s) or selector(s) to use as values variables; if `value_vars` - is empty all columns that are not in `id_vars` will be used. - variable_name - Name to give to the `variable` column. Defaults to "variable" - value_name - Name to give to the `value` column. Defaults to "value" - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> import polars.selectors as cs - >>> df.melt(id_vars="a", value_vars=cs.numeric()) - shape: (6, 3) - ┌─────┬──────────┬───────┐ - │ a ┆ variable ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 │ - ╞═════╪══════════╪═══════╡ - │ x ┆ b ┆ 1 │ - │ y ┆ b ┆ 3 │ - │ z ┆ b ┆ 5 │ - │ x ┆ c ┆ 2 │ - │ y ┆ c ┆ 4 │ - │ z ┆ c ┆ 6 │ - └─────┴──────────┴───────┘ - - ''' - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: - ''' - Unstack a long table to a wide form without doing an aggregation. - - This can be much faster than a pivot, because it can skip the grouping phase. - - Warnings - -------- - This functionality is experimental and may be subject to changes - without it being considered a breaking change. - - Parameters - ---------- - step - Number of rows in the unstacked frame. - how : { \'vertical\', \'horizontal\' } - Direction of the unstack. - columns - Column name(s) or selector(s) to include in the operation. - If set to `None` (default), use all columns. - fill_values - Fill values that don\'t fit the new size with this value. 
- - Examples - -------- - >>> from string import ascii_uppercase - >>> df = pl.DataFrame( - ... { - ... "x": list(ascii_uppercase[0:8]), - ... "y": pl.int_range(1, 9, eager=True), - ... } - ... ).with_columns( - ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), - ... ) - >>> df - shape: (8, 3) - ┌─────┬─────┬──────────┐ - │ x ┆ y ┆ z │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ list[u8] │ - ╞═════╪═════╪══════════╡ - │ A ┆ 1 ┆ [1, 2] │ - │ B ┆ 2 ┆ [2, 3] │ - │ C ┆ 3 ┆ [3, 4] │ - │ D ┆ 4 ┆ [4, 5] │ - │ E ┆ 5 ┆ [5, 6] │ - │ F ┆ 6 ┆ [6, 7] │ - │ G ┆ 7 ┆ [7, 8] │ - │ H ┆ 8 ┆ [8, 9] │ - └─────┴─────┴──────────┘ - >>> df.unstack(step=4, how="vertical") - shape: (4, 6) - ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ - │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ - ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ - │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ - │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ - │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ - │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ - └─────┴─────┴─────┴─────┴──────────┴──────────┘ - >>> df.unstack(step=2, how="horizontal") - shape: (4, 6) - ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ - │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ - ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ - │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ - │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ - │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ - │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ - └─────┴─────┴─────┴─────┴──────────┴──────────┘ - >>> import polars.selectors as cs - >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) - shape: (5, 2) - ┌─────┬─────┐ - │ y_0 ┆ y_1 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - │ 4 ┆ 0 │ - │ 5 ┆ 0 │ - └─────┴─────┘ - - ''' - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: - ''' - Group by the given columns and return the groups as separate dataframes. - - Parameters - ---------- - by - Column name(s) or selector(s) to group by. - *more_by - Additional names of columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default partition by operation. - include_key - Include the columns used to partition the DataFrame in the output. - as_dict - Return a dictionary instead of a list. The dictionary keys are the distinct - group values that identify that group. - - Examples - -------- - Pass a single column name to partition by that column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... 
) - >>> df.partition_by("a") # doctest: +IGNORE_RESULT - [shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘] - - Partition by multiple columns by either passing a list of column names, or by - specifying each column name as a positional argument. - - >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT - [shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘] - - Return the partitions as a dictionary by specifying `as_dict=True`. - - >>> import polars.selectors as cs - >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT - {\'a\': shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - \'b\': shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - \'c\': shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘} - - ''' - def shift(self, n: int = ...) -> DataFrame: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... ) - >>> df.shift() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ null ┆ null │ - │ 1 ┆ 5 │ - │ 2 ┆ 6 │ - │ 3 ┆ 7 │ - └──────┴──────┘ - - Pass a negative value to shift in the opposite direction instead. - - >>> df.shift(-2) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ null ┆ null │ - │ null ┆ null │ - └──────┴──────┘ - - Specify `fill_value` to fill the resulting null values. 
- - >>> df.shift(-2, fill_value=100) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ 100 ┆ 100 │ - │ 100 ┆ 100 │ - └─────┴─────┘ - - ''' - def is_duplicated(self) -> Series: - ''' - Get a mask of all duplicated rows in this DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) - >>> df.is_duplicated() - shape: (4,) - Series: \'\' [bool] - [ - true - false - false - true - ] - - This mask can be used to visualize the duplicated lines like this: - - >>> df.filter(df.is_duplicated()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ str │ - ╞═════╪═════╡ - │ 1 ┆ x │ - │ 1 ┆ x │ - └─────┴─────┘ - ''' - def is_unique(self) -> Series: - ''' - Get a mask of all unique rows in this DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) - >>> df.is_unique() - shape: (4,) - Series: \'\' [bool] - [ - false - true - true - false - ] - - This mask can be used to visualize the unique lines like this: - - >>> df.filter(df.is_unique()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ str │ - ╞═════╪═════╡ - │ 2 ┆ y │ - │ 3 ┆ z │ - └─────┴─────┘ - ''' - def lazy(self) -> LazyFrame: - ''' - Start a lazy query from this point. This returns a `LazyFrame` object. - - Operations on a `LazyFrame` are not executed until this is requested by either - calling: - - * :meth:`.fetch() ` - (run on a small number of rows) - * :meth:`.collect() ` - (run on all data) - * :meth:`.describe_plan() ` - (print unoptimized query plan) - * :meth:`.describe_optimized_plan() ` - (print optimized query plan) - * :meth:`.show_graph() ` - (show (un)optimized query plan as graphviz graph) - - Lazy operations are advised because they allow for query optimization and more - parallelization. - - Returns - ------- - LazyFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> df.lazy() # doctest: +ELLIPSIS - - - ''' - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - ''' - Select columns from this DataFrame. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Examples - -------- - Pass the name of a column to select that column. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.select("foo") - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Multiple columns can be selected by passing a list of column names. - - >>> df.select(["foo", "bar"]) - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - └─────┴─────┘ - - Multiple columns can also be selected using positional arguments instead of a - list. Expressions are also accepted. 
- - >>> df.select(pl.col("foo"), pl.col("bar") + 1) - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - └─────┴─────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) - shape: (3, 1) - ┌───────────┐ - │ threshold │ - │ --- │ - │ i32 │ - ╞═══════════╡ - │ 0 │ - │ 0 │ - │ 10 │ - └───────────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... df.select( - ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), - ... ) - ... - shape: (3, 1) - ┌───────────┐ - │ is_odd │ - │ --- │ - │ struct[2] │ - ╞═══════════╡ - │ {1,0} │ - │ {0,1} │ - │ {1,0} │ - └───────────┘ - - ''' - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - """ - Select columns from this LazyFrame. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - See Also - -------- - select - - """ - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - ''' - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - DataFrame - A new DataFrame with the columns added. - - Notes - ----- - Creating a new DataFrame using this method does not create a new copy of - existing data. - - Examples - -------- - Pass an expression to add it as a new column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ a^2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ - └─────┴──────┴───────┴──────┘ - - Added columns will replace existing columns with the same name. - - >>> df.with_columns(pl.col("a").cast(pl.Float64)) - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1.0 ┆ 0.5 ┆ true │ - │ 2.0 ┆ 4.0 ┆ true │ - │ 3.0 ┆ 10.0 ┆ false │ - │ 4.0 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - Multiple columns can be added by passing a list of expressions. - - >>> df.with_columns( - ... [ - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ] - ... 
) - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Multiple columns also can be added using positional arguments instead of a list. - - >>> df.with_columns( - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ) - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> df.with_columns( - ... ab=pl.col("a") * pl.col("b"), - ... not_c=pl.col("c").not_(), - ... ) - shape: (4, 5) - ┌─────┬──────┬───────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ ab ┆ not_c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ - └─────┴──────┴───────┴──────┴───────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... df.drop("c").with_columns( - ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), - ... ) - ... - shape: (4, 3) - ┌─────┬──────┬─────────────┐ - │ a ┆ b ┆ diffs │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ struct[2] │ - ╞═════╪══════╪═════════════╡ - │ 1 ┆ 0.5 ┆ {null,null} │ - │ 2 ┆ 4.0 ┆ {1,3.5} │ - │ 3 ┆ 10.0 ┆ {1,6.0} │ - │ 4 ┆ 13.0 ┆ {1,3.0} │ - └─────┴──────┴─────────────┘ - - ''' - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - """ - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - See Also - -------- - with_columns - - """ - def n_chunks(self, strategy: str = ...) -> int | list[int]: - ''' - Get number of chunks used by the ChunkedArrays of this DataFrame. - - Parameters - ---------- - strategy : {\'first\', \'all\'} - Return the number of chunks of the \'first\' column, - or \'all\' columns in this DataFrame. - - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... 
) - >>> df.n_chunks() - 1 - >>> df.n_chunks(strategy="all") - [1, 1, 1] - - ''' - def max(self, axis: int | None = ...) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their maximum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`max_horizontal`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.max() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def max_horizontal(self) -> Series: - ''' - Get the maximum value horizontally across columns. - - Returns - ------- - Series - A Series named `"max"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.max_horizontal() - shape: (3,) - Series: \'max\' [f64] - [ - 4.0 - 5.0 - 6.0 - ] - ''' - def min(self, axis: int | None = ...) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their minimum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`min_horizontal`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.min() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - ''' - def min_horizontal(self) -> Series: - ''' - Get the minimum value horizontally across columns. - - Returns - ------- - Series - A Series named `"min"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.min_horizontal() - shape: (3,) - Series: \'min\' [f64] - [ - 1.0 - 2.0 - 3.0 - ] - ''' - def sum(self) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their sum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`sum_horizontal`. - null_strategy : {\'ignore\', \'propagate\'} - This argument is only used if `axis == 1`. - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.sum() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 6 ┆ 21 ┆ null │ - └─────┴─────┴──────┘ - ''' - def sum_horizontal(self) -> Series: - ''' - Sum all values horizontally across columns. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - If set to `False`, any null value in the input will lead to a null output. 
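An illustrative sketch of the `ignore_nulls` behaviour described above (not part of the generated examples):

>>> df = pl.DataFrame({"a": [1, None], "b": [2, 3]})
>>> df.sum_horizontal()  # nulls ignored: both row sums are computed  # doctest: +SKIP
>>> df.sum_horizontal(ignore_nulls=False)  # the row containing a null sums to null  # doctest: +SKIP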
- - Returns - ------- - Series - A Series named `"sum"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.sum_horizontal() - shape: (3,) - Series: \'sum\' [f64] - [ - 5.0 - 7.0 - 9.0 - ] - ''' - def mean(self) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their mean value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`mean_horizontal`. - null_strategy : {\'ignore\', \'propagate\'} - This argument is only used if `axis == 1`. - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... "spam": [True, False, None], - ... } - ... ) - >>> df.mean() - shape: (1, 4) - ┌─────┬─────┬──────┬──────┐ - │ foo ┆ bar ┆ ham ┆ spam │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str ┆ f64 │ - ╞═════╪═════╪══════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ - └─────┴─────┴──────┴──────┘ - ''' - def mean_horizontal(self) -> Series: - ''' - Take the mean of all values horizontally across columns. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - If set to `False`, any null value in the input will lead to a null output. - - Returns - ------- - Series - A Series named `"mean"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.mean_horizontal() - shape: (3,) - Series: \'mean\' [f64] - [ - 2.5 - 3.5 - 4.5 - ] - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their standard deviation value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.std() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1.0 ┆ 1.0 ┆ null │ - └─────┴─────┴──────┘ - >>> df.std(ddof=0) - shape: (1, 3) - ┌──────────┬──────────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞══════════╪══════════╪══════╡ - │ 0.816497 ┆ 0.816497 ┆ null │ - └──────────┴──────────┴──────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their variance value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.var() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1.0 ┆ 1.0 ┆ null │ - └─────┴─────┴──────┘ - >>> df.var(ddof=0) - shape: (1, 3) - ┌──────────┬──────────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞══════════╪══════════╪══════╡ - │ 0.666667 ┆ 0.666667 ┆ null │ - └──────────┴──────────┴──────┘ - - ''' - def median(self) -> Self: - ''' - Aggregate the columns of this DataFrame to their median value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.median() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null │ - └─────┴─────┴──────┘ - - ''' - def product(self) -> DataFrame: - ''' - Aggregate the columns of this DataFrame to their product values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [0.5, 4, 10], - ... "c": [True, True, False], - ... } - ... ) - - >>> df.product() - shape: (1, 3) - ┌─────┬──────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ i64 │ - ╞═════╪══════╪═════╡ - │ 6 ┆ 20.0 ┆ 0 │ - └─────┴──────┴─────┘ - - ''' - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.quantile(0.5, "nearest") - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null │ - └─────┴─────┴──────┘ - - ''' - def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Convert categorical variables into dummy/indicator variables. - - Parameters - ---------- - columns - Column name(s) or selector(s) that should be converted to dummy - variables. If set to `None` (default), convert all columns. - separator - Separator/delimiter used when generating column names. - drop_first - Remove the first category from the variables being encoded. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2], - ... "bar": [3, 4], - ... "ham": ["a", "b"], - ... } - ... 
) - >>> df.to_dummies() - shape: (2, 6) - ┌───────┬───────┬───────┬───────┬───────┬───────┐ - │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ - ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ - │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ - │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ - └───────┴───────┴───────┴───────┴───────┴───────┘ - - >>> df.to_dummies(drop_first=True) - shape: (2, 3) - ┌───────┬───────┬───────┐ - │ foo_2 ┆ bar_4 ┆ ham_b │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 │ - ╞═══════╪═══════╪═══════╡ - │ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1 ┆ 1 │ - └───────┴───────┴───────┘ - - >>> import polars.selectors as cs - >>> df.to_dummies(cs.integer(), separator=":") - shape: (2, 5) - ┌───────┬───────┬───────┬───────┬─────┐ - │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ - ╞═══════╪═══════╪═══════╪═══════╪═════╡ - │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ - │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ - └───────┴───────┴───────┴───────┴─────┘ - - >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") - shape: (2, 3) - ┌───────┬───────┬─────┐ - │ foo:2 ┆ bar:4 ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ str │ - ╞═══════╪═══════╪═════╡ - │ 0 ┆ 0 ┆ a │ - │ 1 ┆ 1 ┆ b │ - └───────┴───────┴─────┘ - - ''' - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: - ''' - Drop duplicate rows from this dataframe. - - Parameters - ---------- - subset - Column name(s) or selector(s), to consider when identifying - duplicate rows. If set to `None` (default), use all columns. - keep : {\'first\', \'last\', \'any\', \'none\'} - Which of the duplicate rows to keep. - - * \'any\': Does not give any guarantee of which row is kept. - This allows more optimizations. - * \'none\': Don\'t keep duplicate rows. - * \'first\': Keep first unique row. - * \'last\': Keep last unique row. - maintain_order - Keep the same order as the original DataFrame. This is more expensive to - compute. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - Returns - ------- - DataFrame - DataFrame with unique rows. - - Warnings - -------- - This method will fail if there is a column of type `List` in the DataFrame or - subset. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 1], - ... "bar": ["a", "a", "a", "a"], - ... "ham": ["b", "b", "b", "b"], - ... } - ... ) - >>> df.unique(maintain_order=True) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> df.unique(subset=["bar", "ham"], maintain_order=True) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> df.unique(keep="last", maintain_order=True) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - - ''' - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: - ''' - Return the number of unique rows, or the number of unique row-subsets. - - Parameters - ---------- - subset - One or more columns/expressions that define what to count; - omit to return the count of unique rows. 
- - Notes - ----- - This method operates at the `DataFrame` level; to operate on subsets at the - expression level you can make use of struct-packing instead, for example: - - >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() - - If instead you want to count the number of unique values per-column, you can - also use expression-level syntax to return a new frame containing that result: - - >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) - >>> df_nunique = df.select(pl.all().n_unique()) - - In aggregate context there is also an equivalent method for returning the - unique values per-group: - - >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 1, 2, 3, 4, 5], - ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], - ... "c": [True, True, True, False, True, True], - ... } - ... ) - >>> df.n_unique() - 5 - - Simple columns subset. - - >>> df.n_unique(subset=["b", "c"]) - 4 - - Expression subset. - - >>> df.n_unique( - ... subset=[ - ... (pl.col("a") // 2), - ... (pl.col("c") | (pl.col("b") >= 2)), - ... ], - ... ) - 3 - - ''' - def approx_n_unique(self) -> DataFrame: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> df.approx_n_unique() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_unique(self) -> DataFrame: - """ - Approximate count of unique values. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`DataFrame.approx_n_unique`. - - """ - def rechunk(self) -> Self: - """ - Rechunk the data in this DataFrame to a contiguous allocation. - - This will make sure all subsequent operations have optimal and predictable - performance. - """ - def null_count(self) -> Self: - ''' - Create a new DataFrame that shows the null counts per column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 3], - ... "bar": [6, 7, None], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.null_count() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 1 ┆ 0 │ - └─────┴─────┴─────┘ - - ''' - def sample(self, n: int | Series | None = ...) -> Self: - ''' - Sample from this DataFrame. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - If set to True, the order of the sampled rows will be shuffled. If - set to False (default), the order of the returned rows will be - neither stable nor fully random. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: - ''' - Apply a horizontal reduction on a DataFrame. - - This can be used to effectively determine aggregations on a row level, and can - be applied to any DataType that can be supercasted (casted to a similar parent - type). - - An example of the supercast rules when applying an arithmetic operation on two - DataTypes are for instance: - - - Int8 + Utf8 = Utf8 - - Float32 + Int64 = Float32 - - Float32 + Float64 = Float64 - - Examples - -------- - A horizontal sum operation: - - >>> df = pl.DataFrame( - ... { - ... "a": [2, 1, 3], - ... "b": [1, 2, 3], - ... "c": [1.0, 2.0, 3.0], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 + s2) - shape: (3,) - Series: \'a\' [f64] - [ - 4.0 - 5.0 - 9.0 - ] - - A horizontal minimum operation: - - >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) - >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 1.0 - 3.0 - ] - - A horizontal string concatenation: - - >>> df = pl.DataFrame( - ... { - ... "a": ["foo", "bar", 2], - ... "b": [1, 2, 3], - ... "c": [1.0, 2.0, 3.0], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 + s2) - shape: (3,) - Series: \'a\' [str] - [ - "foo11.0" - "bar22.0" - null - ] - - A horizontal boolean or, similar to a row-wise .any(): - - >>> df = pl.DataFrame( - ... { - ... "a": [False, False, True], - ... "b": [False, True, False], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 | s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - Parameters - ---------- - operation - function that takes two `Series` and returns a `Series`. - - ''' - def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: - ''' - Get the values of a single row, either by index or by predicate. - - Parameters - ---------- - index - Row index. - by_predicate - Select the row according to a given expression/predicate. - named - Return a dictionary instead of a tuple. The dictionary is a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - - Returns - ------- - tuple (default) or dictionary of row values - - Notes - ----- - The `index` and `by_predicate` params are mutually exclusive. Additionally, - to ensure clarity, the `by_predicate` parameter must be supplied by keyword. - - When using `by_predicate` it is an error condition if anything other than - one row is returned; more than one row raises `TooManyRowsReturnedError`, and - zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). - - Warnings - -------- - You should NEVER use this method to iterate over a DataFrame; if you require - row-iteration you should strongly prefer use of `iter_rows()` instead. - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - rows : Materialise all frame data as a list of rows (potentially expensive). - item: Return dataframe element as a scalar. - - Examples - -------- - Specify an index to return the row at the given index as a tuple. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.row(2) - (3, 8, \'c\') - - Specify `named=True` to get a dictionary instead with a mapping of column - names to row values. - - >>> df.row(2, named=True) - {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} - - Use `by_predicate` to return the row that matches the given predicate. - - >>> df.row(by_predicate=(pl.col("ham") == "b")) - (2, 7, \'b\') - - ''' - def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: - ''' - Returns all data in the DataFrame as a list of rows of python-native values. - - Parameters - ---------- - named - Return dictionaries instead of tuples. The dictionaries are a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Warnings - -------- - Row-iteration is not optimal as the underlying data is stored in columnar form; - where possible, prefer export via one of the dedicated export/output methods. - Where possible you should also consider using `iter_rows` instead to avoid - materialising all the data at once. - - Returns - ------- - list of tuples (default) or dictionaries of row values - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - rows_by_key : Materialises frame data as a key-indexed dictionary. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "x": ["a", "b", "b", "a"], - ... "y": [1, 2, 3, 4], - ... "z": [0, 3, 6, 9], - ... } - ... ) - >>> df.rows() - [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] - >>> df.rows(named=True) - [{\'x\': \'a\', \'y\': 1, \'z\': 0}, - {\'x\': \'b\', \'y\': 2, \'z\': 3}, - {\'x\': \'b\', \'y\': 3, \'z\': 6}, - {\'x\': \'a\', \'y\': 4, \'z\': 9}] - - ''' - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: - ''' - Returns DataFrame data as a keyed dictionary of python-native values. - - Note that this method should not be used in place of native operations, due to - the high cost of materialising all frame data out into a dictionary; it should - be used only when you need to move the values out into a Python data structure - or other object that cannot operate directly with Polars/Arrow. - - Parameters - ---------- - key - The column(s) to use as the key for the returned dictionary. If multiple - columns are specified, the key will be a tuple of those values, otherwise - it will be a string. - named - Return dictionary rows instead of tuples, mapping column name to row value. - include_key - Include key values inline with the associated data (by default the key - values are omitted as a memory/performance optimisation, as they can be - reoconstructed from the key). - unique - Indicate that the key is unique; this will result in a 1:1 mapping from - key to a single associated row. Note that if the key is *not* actually - unique the last row with the given key will be returned. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. 
If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - See Also - -------- - rows : Materialise all frame data as a list of rows (potentially expensive). - iter_rows : Row iterator over frame data (does not materialise all rows). - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "w": ["a", "b", "b", "a"], - ... "x": ["q", "q", "q", "k"], - ... "y": [1.0, 2.5, 3.0, 4.5], - ... "z": [9, 8, 7, 6], - ... } - ... ) - - Group rows by the given key column(s): - - >>> df.rows_by_key(key=["w"]) - defaultdict(, - {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], - \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) - - Return the same row groupings as dictionaries: - - >>> df.rows_by_key(key=["w"], named=True) - defaultdict(, - {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, - {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], - \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, - {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) - - Return row groupings, assuming keys are unique: - - >>> df.rows_by_key(key=["z"], unique=True) - {9: (\'a\', \'q\', 1.0), - 8: (\'b\', \'q\', 2.5), - 7: (\'b\', \'q\', 3.0), - 6: (\'a\', \'k\', 4.5)} - - Return row groupings as dictionaries, assuming keys are unique: - - >>> df.rows_by_key(key=["z"], named=True, unique=True) - {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, - 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, - 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, - 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} - - Return dictionary rows grouped by a compound key, including key values: - - >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) - defaultdict(, - {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], - (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, - {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], - (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) - - ''' - def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: - ''' - Returns an iterator over the DataFrame of rows of python-native values. - - Parameters - ---------- - named - Return dictionaries instead of tuples. The dictionaries are a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - buffer_size - Determines the number of rows that are buffered internally while iterating - over the data; you should only modify this in very specific cases where the - default value is determined not to be a good fit to your access pattern, as - the speedup from using the buffer is significant (~2-4x). Setting this - value to zero disables row buffering (not recommended). - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Warnings - -------- - Row iteration is not optimal as the underlying data is stored in columnar form; - where possible, prefer export via one of the dedicated export/output methods - that deals with columnar data. - - Returns - ------- - iterator of tuples (default) or dictionaries (if named) of python row values - - See Also - -------- - rows : Materialises all frame data as a list of rows (potentially expensive). - rows_by_key : Materialises frame data as a key-indexed dictionary. 
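A small illustrative sketch of tuning `buffer_size` (not part of the generated examples; `process` is a hypothetical per-row consumer):

>>> big = pl.DataFrame({"a": range(100_000), "b": range(100_000)})
>>> for row in big.iter_rows(named=True, buffer_size=4096):  # doctest: +SKIP
...     process(row)
...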
- - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> [row[0] for row in df.iter_rows()] - [1, 3, 5] - >>> [row["b"] for row in df.iter_rows(named=True)] - [2, 4, 6] - - ''' - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: - ''' - Returns a non-copying iterator of slices over the underlying DataFrame. - - Parameters - ---------- - n_rows - Determines the number of rows contained in each DataFrame slice. - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... data={ - ... "a": range(17_500), - ... "b": date(2023, 1, 1), - ... "c": "klmnoopqrstuvwxyz", - ... }, - ... schema_overrides={"a": pl.Int32}, - ... ) - >>> for idx, frame in enumerate(df.iter_slices()): - ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") - ... - DataFrame:[0]:10000 - DataFrame:[1]:7500 - - Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and - any supported frame export/conversion types; for example, as RecordBatches: - - >>> for frame in df.iter_slices(n_rows=15_000): - ... record_batch = frame.to_arrow().to_batches()[0] - ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") - ... - a: int32 - b: date32[day] - c: large_string - << 15000 - a: int32 - b: date32[day] - c: large_string - << 2500 - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - partition_by : Split into multiple DataFrames, partitioned by groups. - - ''' - def shrink_to_fit(self) -> Self: - """ - Shrink DataFrame memory usage. - - Shrinks to fit the exact capacity needed to hold the data. - - """ - def gather_every(self, n: int) -> DataFrame: - ''' - Take every nth row in the DataFrame and return as a new DataFrame. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - >>> s.gather_every(2) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 5 │ - │ 3 ┆ 7 │ - └─────┴─────┘ - - ''' - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: - ''' - Hash and combine the rows in this DataFrame. - - The hash value is of type `UInt64`. - - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. - - Notes - ----- - This implementation of :func:`hash_rows` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 3, 4], - ... "ham": ["a", "b", None, "d"], - ... } - ... ) - >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT - shape: (4,) - Series: \'\' [u64] - [ - 10783150408545073287 - 1438741209321515184 - 10047419486152048166 - 2047317070637311557 - ] - - ''' - def interpolate(self) -> DataFrame: - ''' - Interpolate intermediate values. The interpolation method is linear. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 9, 10], - ... "bar": [6, 7, 9, None], - ... "baz": [1, None, None, 9], - ... } - ... 
) - >>> df.interpolate() - shape: (4, 3) - ┌──────┬──────┬──────────┐ - │ foo ┆ bar ┆ baz │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════════╡ - │ 1.0 ┆ 6.0 ┆ 1.0 │ - │ 5.0 ┆ 7.0 ┆ 3.666667 │ - │ 9.0 ┆ 9.0 ┆ 6.333333 │ - │ 10.0 ┆ null ┆ 9.0 │ - └──────┴──────┴──────────┘ - - ''' - def is_empty(self) -> bool: - ''' - Check if the dataframe is empty. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.is_empty() - False - >>> df.filter(pl.col("foo") > 99).is_empty() - True - - ''' - def to_struct(self, name: str) -> Series: - ''' - Convert a `DataFrame` to a `Series` of type `Struct`. - - Parameters - ---------- - name - Name for the struct Series - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4, 5], - ... "b": ["one", "two", "three", "four", "five"], - ... } - ... ) - >>> df.to_struct("nums") - shape: (5,) - Series: \'nums\' [struct[2]] - [ - {1,"one"} - {2,"two"} - {3,"three"} - {4,"four"} - {5,"five"} - ] - - ''' - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Decompose struct columns into separate columns for each of their fields. - - The new columns will be inserted into the dataframe at the location of the - struct column. - - Parameters - ---------- - columns - Name of the struct column(s) that should be unnested. - *more_columns - Additional columns to unnest, specified as positional arguments. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "before": ["foo", "bar"], - ... "t_a": [1, 2], - ... "t_b": ["a", "b"], - ... "t_c": [True, None], - ... "t_d": [[1, 2], [3]], - ... "after": ["baz", "womp"], - ... } - ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") - >>> df - shape: (2, 3) - ┌────────┬─────────────────────┬───────┐ - │ before ┆ t_struct ┆ after │ - │ --- ┆ --- ┆ --- │ - │ str ┆ struct[4] ┆ str │ - ╞════════╪═════════════════════╪═══════╡ - │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ - │ bar ┆ {2,"b",null,[3]} ┆ womp │ - └────────┴─────────────────────┴───────┘ - >>> df.unnest("t_struct") - shape: (2, 6) - ┌────────┬─────┬─────┬──────┬───────────┬───────┐ - │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ - ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ - │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ - │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ - └────────┴─────┴─────┴──────┴───────────┴───────┘ - - ''' - def corr(self, **kwargs: Any) -> DataFrame: - ''' - Return pairwise Pearson product-moment correlation coefficients between columns. - - See numpy `corrcoef` for more information: - https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html - - Notes - ----- - This functionality requires numpy to be installed. - - Parameters - ---------- - **kwargs - Keyword arguments are passed to numpy `corrcoef`. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) - >>> df.corr() - shape: (3, 3) - ┌──────┬──────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════╡ - │ 1.0 ┆ -1.0 ┆ 1.0 │ - │ -1.0 ┆ 1.0 ┆ -1.0 │ - │ 1.0 ┆ -1.0 ┆ 1.0 │ - └──────┴──────┴──────┘ - - ''' - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: - ''' - Take two sorted DataFrames and merge them by the sorted key. - - The output of this operation will also be sorted. 
- It is the callers responsibility that the frames are sorted - by that key otherwise the output will not make sense. - - The schemas of both DataFrames must be equal. - - Parameters - ---------- - other - Other DataFrame that must be merged - key - Key that is sorted. - - Examples - -------- - >>> df0 = pl.DataFrame( - ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} - ... ).sort("age") - >>> df0 - shape: (3, 2) - ┌───────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═════╡ - │ bob ┆ 18 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └───────┴─────┘ - >>> df1 = pl.DataFrame( - ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} - ... ).sort("age") - >>> df1 - shape: (4, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - └────────┴─────┘ - >>> df0.merge_sorted(df1, key="age") - shape: (7, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ bob ┆ 18 │ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └────────┴─────┘ - ''' - def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: - """ - Indicate that one or multiple columns are sorted. - - Parameters - ---------- - column - Columns that are sorted - more_columns - Additional columns that are sorted, specified as positional arguments. - descending - Whether the columns are sorted in descending order. - """ - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: - ''' - Update the values in this `DataFrame` with the values in `other`. - - By default, null values in the right dataframe are ignored. Use - `ignore_nulls=False` to overwrite values in this frame with null values in other - frame. - - Notes - ----- - This is syntactic sugar for a left/inner join, with an optional coalesce when - `include_nulls = False`. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Parameters - ---------- - other - DataFrame that will be used to update the values - on - Column names that will be joined on. - If none given the row count is used. - left_on - Join column(s) of the left DataFrame. - right_on - Join column(s) of the right DataFrame. - how : {\'left\', \'inner\', \'outer\'} - * \'left\' will keep all rows from the left table; rows may be duplicated - if multiple rows in the right frame match the left row\'s key. - * \'inner\' keeps only those rows where the key exists in both frames. - * \'outer\' will update existing rows where the key matches while also - adding any new rows contained in the given frame. - include_nulls - If True, null values from the right dataframe will be used to update the - left dataframe. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4], - ... "B": [400, 500, 600, 700], - ... } - ... ) - >>> df - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 400 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - >>> new_df = pl.DataFrame( - ... { - ... "B": [-66, None, -99], - ... "C": [5, 3, 1], - ... } - ... 
) - - Update `df` values with the non-null values in `new_df`, by row index: - - >>> df.update(new_df) - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, by row index, - but only keeping those rows that are common to both frames: - - >>> df.update(new_df, how="inner") - shape: (3, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") - shape: (5, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴─────┘ - - Update `df` values including null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> df.update( - ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ null │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴──────┘ - - ''' - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: - """ - Start a group by operation. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.group_by`. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - .. note:: - Within each group, the order of rows is always preserved, regardless - of this argument. - - Returns - ------- - GroupBy - Object which can be used to perform aggregations. - - """ - def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. 
- Doing so incorrectly will lead to incorrect output - - """ - def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.9 - This method has been renamed to :func:`DataFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - """ - def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.group_by_dynamic`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - include_boundaries - Add the lower and upper bound of the window to the "_lower_bound" and - "_upper_bound" columns. This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. 
- check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - DynamicGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - ''' - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: - """ - Apply a custom/user-defined function (UDF) over the rows of the DataFrame. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.map_rows`. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output type of the operation. If none given, Polars tries to infer the type. - inference_size - Only used in the case when the custom function returns rows. - This uses the first `n` rows to determine the output schema - - """ - def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - fill None values with this value. - n - Number of places to shift (may be negative). - - """ - def take_every(self, n: int) -> DataFrame: - """ - Take every nth row in the DataFrame and return as a new DataFrame. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def find_idx_by_name(self, name: str) -> int: - """ - Find the index of a column by name. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`get_column_index`. - - Parameters - ---------- - name - Name of the column to find. - """ - def insert_at_idx(self, index: int, column: Series) -> Self: - """ - Insert a Series at a certain column index. This operation is in place. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`insert_column`. - - Parameters - ---------- - index - Column to insert the new `Series` column. - column - `Series` to insert. - """ - def replace_at_idx(self, index: int, new_column: Series) -> Self: - """ - Replace a column at an index location. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`replace_column`. - - Parameters - ---------- - index - Column index. - new_column - Series that will replace the column. - """ - def frame_equal(self, other: DataFrame) -> bool: - """ - Check whether the DataFrame is equal to another DataFrame. - - .. deprecated:: 0.19.16 - This method has been renamed to :func:`equals`. - - Parameters - ---------- - other - DataFrame to compare with. - null_equal - Consider null values as equal. - """ - @property - def shape(self): ... - @property - def height(self): ... - @property - def width(self): ... - @property - def dtypes(self): ... - @property - def flags(self): ... - @property - def schema(self): ... -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/dataframe/frame.pyi similarity index 99% rename from polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/dataframe/frame rename to polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/dataframe/frame.pyi index 562effd..2d01627 100644 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/dataframe/frame +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/dataframe/frame.pyi @@ -1,3 +1,4 @@ +#: version 0.19.18 import P import deltalake import np as np @@ -36,7 +37,7 @@ _dtype_str_repr: builtin_function_or_method class DataFrame: _accessors: _ClassVar[set] = ... - columns: Incomplete + columns: list[str] def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... @classmethod def _from_pydf(cls, py_df: PyDataFrame) -> Self: @@ -1377,18 +1378,20 @@ class DataFrame: Parameters ---------- table_name - Name of the table to create or append to in the target SQL database. - If your table name contains special characters, it should be quoted. + Schema-qualified name of the table to create or append to in the target + SQL database. If your table name contains special characters, it should + be quoted. connection Connection URI string, for example: * "postgresql://user:pass@server:port/database" * "sqlite:////path/to/database.db" if_exists : {\'append\', \'replace\', \'fail\'} - The insert mode. - \'replace\' will create a new database table, overwriting an existing one. - \'append\' will append to an existing table. - \'fail\' will fail if table already exists. + The insert mode: + + * \'replace\' will create a new database table, overwriting an existing one. + * \'append\' will append to an existing table. + * \'fail\' will fail if table already exists. engine : {\'sqlalchemy\', \'adbc\'} Select the engine used for writing the data. ''' @@ -6221,6 +6224,60 @@ class DataFrame: >>> [row["b"] for row in df.iter_rows(named=True)] [2, 4, 6] + ''' + def iter_columns(self) -> Iterator[Series]: + ''' + Returns an iterator over the DataFrame\'s columns. + + Notes + ----- + Consider whether you can use :func:`all` instead. + If you can, it will be more efficient. + + Returns + ------- + Iterator of Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [s.name for s in df.iter_columns()] + [\'a\', \'b\'] + + If you\'re using this to modify a dataframe\'s columns, e.g. + + >>> # Do NOT do this + >>> pl.DataFrame(column * 2 for column in df.iter_columns()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + + then consider whether you can use :func:`all` instead: + + >>> df.select(pl.all() * 2) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + ''' def iter_slices(self, n_rows: int = ...) 
-> Iterator[DataFrame]: ''' diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/expr/expr deleted file mode 100644 index 5131d44..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/expr/expr +++ /dev/null @@ -1,8289 +0,0 @@ -import P -import np as np -import pl -from builtins import PyExpr -from datetime import timedelta -from polars.datatypes.classes import Categorical as Categorical, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 -from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy -from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import no_default as no_default, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence - -TYPE_CHECKING: bool -py_arg_where: builtin_function_or_method -pyreduce: builtin_function_or_method - -class Expr: - _pyexpr: _ClassVar[None] = ... - _accessors: _ClassVar[set] = ... - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _repr_html_(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int | bool) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... 
- def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int | bool) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr | int | bool) -> Self: ... - def __rxor__(self, other: Any) -> Self: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: - """Numpy universal functions.""" - @classmethod - def from_json(cls, value: str) -> Self: - """ - Read an expression from a JSON encoded string to construct an Expression. - - Parameters - ---------- - value - JSON encoded string value - - """ - def to_physical(self) -> Self: - ''' - Cast to physical representation of the logical dtype. - - - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` - - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` - - `List(inner)` -> `List(physical of inner)` - - Other data types will be left unchanged. - - Examples - -------- - Replicating the pandas - `pd.factorize - `_ - function. - - >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( - ... [ - ... pl.col("vals").cast(pl.Categorical), - ... pl.col("vals") - ... .cast(pl.Categorical) - ... .to_physical() - ... .alias("vals_physical"), - ... ] - ... ) - shape: (4, 2) - ┌──────┬───────────────┐ - │ vals ┆ vals_physical │ - │ --- ┆ --- │ - │ cat ┆ u32 │ - ╞══════╪═══════════════╡ - │ a ┆ 0 │ - │ x ┆ 1 │ - │ null ┆ null │ - │ a ┆ 0 │ - └──────┴───────────────┘ - - ''' - def any(self) -> Self: - ''' - Return whether any of the values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is null. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, False], - ... "b": [False, False], - ... "c": [None, False], - ... } - ... ) - >>> df.select(pl.col("*").any()) - shape: (1, 3) - ┌──────┬───────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪═══════╡ - │ true ┆ false ┆ false │ - └──────┴───────┴───────┘ - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> df.select(pl.col("*").any(ignore_nulls=False)) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ null │ - └──────┴───────┴──────┘ - - ''' - def all(self) -> Self: - ''' - Return whether all values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - .. note:: - This method is not to be confused with the function :func:`polars.all`, - which can be used to select all columns. 
- - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is null. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, True], - ... "b": [False, True], - ... "c": [None, True], - ... } - ... ) - >>> df.select(pl.col("*").all()) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ true │ - └──────┴───────┴──────┘ - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> df.select(pl.col("*").all(ignore_nulls=False)) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ null │ - └──────┴───────┴──────┘ - - ''' - def arg_true(self) -> Self: - ''' - Return indices where expression evaluates `True`. - - .. warning:: - Modifies number of rows returned, so will fail in combination with other - expressions. Use as only expression in `select` / `with_columns`. - - See Also - -------- - Series.arg_true : Return indices where Series is True - polars.arg_where - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) - >>> df.select((pl.col("a") == 1).arg_true()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - │ 3 │ - └─────┘ - - ''' - def sqrt(self) -> Self: - ''' - Compute the square root of the elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").sqrt()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.414214 │ - │ 2.0 │ - └──────────┘ - - ''' - def cbrt(self) -> Self: - ''' - Compute the cube root of the elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").cbrt()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.259921 │ - │ 1.587401 │ - └──────────┘ - - ''' - def log10(self) -> Self: - ''' - Compute the base 10 logarithm of the input array, element-wise. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").log10()) - shape: (3, 1) - ┌─────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞═════════╡ - │ 0.0 │ - │ 0.30103 │ - │ 0.60206 │ - └─────────┘ - - ''' - def exp(self) -> Self: - ''' - Compute the exponential, element-wise. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").exp()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 2.718282 │ - │ 7.389056 │ - │ 54.59815 │ - └──────────┘ - - ''' - def alias(self, name: str) -> Self: - ''' - Rename the expression. - - Parameters - ---------- - name - The new name. - - See Also - -------- - map - prefix - suffix - - Examples - -------- - Rename an expression to avoid overwriting an existing column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... ) - >>> df.with_columns( - ... pl.col("a") + 10, - ... pl.col("b").str.to_uppercase().alias("c"), - ... 
) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 11 ┆ x ┆ X │ - │ 12 ┆ y ┆ Y │ - │ 13 ┆ z ┆ Z │ - └─────┴─────┴─────┘ - - Overwrite the default name of literal columns to prevent errors due to duplicate - column names. - - >>> df.with_columns( - ... pl.lit(True).alias("c"), - ... pl.lit(4.0).alias("d"), - ... ) - shape: (3, 4) - ┌─────┬─────┬──────┬─────┐ - │ a ┆ b ┆ c ┆ d │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ bool ┆ f64 │ - ╞═════╪═════╪══════╪═════╡ - │ 1 ┆ x ┆ true ┆ 4.0 │ - │ 2 ┆ y ┆ true ┆ 4.0 │ - │ 3 ┆ z ┆ true ┆ 4.0 │ - └─────┴─────┴──────┴─────┘ - - ''' - def map_alias(self, function: Callable[[str], str]) -> Self: - ''' - Rename the output of an expression by mapping a function over the root name. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.map`. - - Parameters - ---------- - function - Function that maps a root name to a new name. - - See Also - -------- - keep_name - prefix - suffix - - Examples - -------- - Remove a common suffix and convert to lower case. - - >>> df = pl.DataFrame( - ... { - ... "A_reverse": [3, 2, 1], - ... "B_reverse": ["z", "y", "x"], - ... } - ... ) - >>> df.with_columns( - ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) - ... ) - shape: (3, 4) - ┌───────────┬───────────┬─────┬─────┐ - │ A_reverse ┆ B_reverse ┆ a ┆ b │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═══════════╪═══════════╪═════╪═════╡ - │ 3 ┆ z ┆ 1 ┆ x │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 1 ┆ x ┆ 3 ┆ z │ - └───────────┴───────────┴─────┴─────┘ - - ''' - def prefix(self, prefix: str) -> Self: - ''' - Add a prefix to the root column name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.prefix`. - - Parameters - ---------- - prefix - Prefix to add to the root column name. - - Notes - ----- - This will undo any previous renaming operations on the expression. - - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - suffix - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... ) - >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) - shape: (3, 4) - ┌─────┬─────┬───────────┬───────────┐ - │ a ┆ b ┆ reverse_a ┆ reverse_b │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪═════╪═══════════╪═══════════╡ - │ 1 ┆ x ┆ 3 ┆ z │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 3 ┆ z ┆ 1 ┆ x │ - └─────┴─────┴───────────┴───────────┘ - - ''' - def suffix(self, suffix: str) -> Self: - ''' - Add a suffix to the root column name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.suffix`. - - Parameters - ---------- - suffix - Suffix to add to the root column name. - - Notes - ----- - This will undo any previous renaming operations on the expression. - - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - prefix - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... 
) - >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) - shape: (3, 4) - ┌─────┬─────┬───────────┬───────────┐ - │ a ┆ b ┆ a_reverse ┆ b_reverse │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪═════╪═══════════╪═══════════╡ - │ 1 ┆ x ┆ 3 ┆ z │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 3 ┆ z ┆ 1 ┆ x │ - └─────┴─────┴───────────┴───────────┘ - - ''' - def keep_name(self) -> Self: - ''' - Keep the original root name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.keep`. - - Notes - ----- - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - alias - - Examples - -------- - Undo an alias operation. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2], - ... "b": [3, 4], - ... } - ... ) - >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 9 ┆ 3 │ - │ 18 ┆ 4 │ - └─────┴─────┘ - - Prevent errors due to duplicate column names. - - >>> df.select((pl.lit(10) / pl.all()).name.keep()) - shape: (2, 2) - ┌──────┬──────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════════╡ - │ 10.0 ┆ 3.333333 │ - │ 5.0 ┆ 2.5 │ - └──────┴──────────┘ - - ''' - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: - ''' - Exclude columns from a multi-column expression. - - Only works after a wildcard or regex column selection, and you cannot provide - both string column names *and* dtypes (you may prefer to use selectors instead). - - Parameters - ---------- - columns - The name or datatype of the column(s) to exclude. Accepts regular expression - input. Regular expressions should start with `^` and end with `$`. - *more_columns - Additional names or datatypes of columns to exclude, specified as positional - arguments. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "aa": [1, 2, 3], - ... "ba": ["a", "b", None], - ... "cc": [None, 2.5, 1.5], - ... } - ... ) - >>> df - shape: (3, 3) - ┌─────┬──────┬──────┐ - │ aa ┆ ba ┆ cc │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ f64 │ - ╞═════╪══════╪══════╡ - │ 1 ┆ a ┆ null │ - │ 2 ┆ b ┆ 2.5 │ - │ 3 ┆ null ┆ 1.5 │ - └─────┴──────┴──────┘ - - Exclude by column name(s): - - >>> df.select(pl.all().exclude("ba")) - shape: (3, 2) - ┌─────┬──────┐ - │ aa ┆ cc │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ null │ - │ 2 ┆ 2.5 │ - │ 3 ┆ 1.5 │ - └─────┴──────┘ - - Exclude by regex, e.g. removing all columns whose names end with the letter "a": - - >>> df.select(pl.all().exclude("^.*a$")) - shape: (3, 1) - ┌──────┐ - │ cc │ - │ --- │ - │ f64 │ - ╞══════╡ - │ null │ - │ 2.5 │ - │ 1.5 │ - └──────┘ - - Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: - - >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) - shape: (3, 1) - ┌──────┐ - │ ba │ - │ --- │ - │ str │ - ╞══════╡ - │ a │ - │ b │ - │ null │ - └──────┘ - - ''' - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the expression as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Examples - -------- - >>> def extract_number(expr: pl.Expr) -> pl.Expr: - ... 
"""Extract the digits from a string.""" - ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) - >>> - >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: - ... """Set even numbers negative, and scale by a user-supplied value.""" - ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) - ... return expr * n - >>> - >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) - >>> df.with_columns( - ... udfs=( - ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) - ... ), - ... ) - shape: (4, 2) - ┌──────┬──────┐ - │ val ┆ udfs │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞══════╪══════╡ - │ a: 1 ┆ 5 │ - │ b: 2 ┆ -10 │ - │ c: 3 ┆ 15 │ - │ d: 4 ┆ -20 │ - └──────┴──────┘ - - ''' - def is_not(self) -> Self: - """ - Negate a boolean expression. - - .. deprecated:: 0.19.2 - This method has been renamed to :func:`Expr.not_`. - - """ - def not_(self) -> Self: - ''' - Negate a boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, False, False], - ... "b": ["a", "b", None], - ... } - ... ) - >>> df - shape: (3, 2) - ┌───────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ bool ┆ str │ - ╞═══════╪══════╡ - │ true ┆ a │ - │ false ┆ b │ - │ false ┆ null │ - └───────┴──────┘ - >>> df.select(pl.col("a").not_()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ true │ - └───────┘ - - ''' - def is_null(self) -> Self: - ''' - Returns a boolean Series indicating which values are null. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null - shape: (5, 4) - ┌──────┬─────┬──────────┬──────────┐ - │ a ┆ b ┆ a_isnull ┆ b_isnull │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪═════╪══════════╪══════════╡ - │ 1 ┆ 1.0 ┆ false ┆ false │ - │ 2 ┆ 2.0 ┆ false ┆ false │ - │ null ┆ NaN ┆ true ┆ false │ - │ 1 ┆ 1.0 ┆ false ┆ false │ - │ 5 ┆ 5.0 ┆ false ┆ false │ - └──────┴─────┴──────────┴──────────┘ - - ''' - def is_not_null(self) -> Self: - ''' - Returns a boolean Series indicating which values are not null. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns( - ... pl.all().is_not_null().name.suffix("_not_null") # nan != null - ... ) - shape: (5, 4) - ┌──────┬─────┬────────────┬────────────┐ - │ a ┆ b ┆ a_not_null ┆ b_not_null │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪═════╪════════════╪════════════╡ - │ 1 ┆ 1.0 ┆ true ┆ true │ - │ 2 ┆ 2.0 ┆ true ┆ true │ - │ null ┆ NaN ┆ false ┆ true │ - │ 1 ┆ 1.0 ┆ true ┆ true │ - │ 5 ┆ 5.0 ┆ true ┆ true │ - └──────┴─────┴────────────┴────────────┘ - - ''' - def is_finite(self) -> Self: - ''' - Returns a boolean Series indicating which values are finite. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1.0, 2], - ... "B": [3.0, float("inf")], - ... } - ... ) - >>> df.select(pl.all().is_finite()) - shape: (2, 2) - ┌──────┬───────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞══════╪═══════╡ - │ true ┆ true │ - │ true ┆ false │ - └──────┴───────┘ - - ''' - def is_infinite(self) -> Self: - ''' - Returns a boolean Series indicating which values are infinite. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. 
- - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1.0, 2], - ... "B": [3.0, float("inf")], - ... } - ... ) - >>> df.select(pl.all().is_infinite()) - shape: (2, 2) - ┌───────┬───────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞═══════╪═══════╡ - │ false ┆ false │ - │ false ┆ true │ - └───────┴───────┘ - - ''' - def is_nan(self) -> Self: - ''' - Returns a boolean Series indicating which values are NaN. - - Notes - ----- - Floating point `NaN` (Not A Number) should not be confused - with missing data represented as `Null/None`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) - shape: (5, 3) - ┌──────┬─────┬─────────┐ - │ a ┆ b ┆ b_isnan │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪═════╪═════════╡ - │ 1 ┆ 1.0 ┆ false │ - │ 2 ┆ 2.0 ┆ false │ - │ null ┆ NaN ┆ true │ - │ 1 ┆ 1.0 ┆ false │ - │ 5 ┆ 5.0 ┆ false │ - └──────┴─────┴─────────┘ - - ''' - def is_not_nan(self) -> Self: - ''' - Returns a boolean Series indicating which values are not NaN. - - Notes - ----- - Floating point `NaN` (Not A Number) should not be confused - with missing data represented as `Null/None`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) - shape: (5, 3) - ┌──────┬─────┬──────────────┐ - │ a ┆ b ┆ b_is_not_nan │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪═════╪══════════════╡ - │ 1 ┆ 1.0 ┆ true │ - │ 2 ┆ 2.0 ┆ true │ - │ null ┆ NaN ┆ false │ - │ 1 ┆ 1.0 ┆ true │ - │ 5 ┆ 5.0 ┆ true │ - └──────┴─────┴──────────────┘ - - ''' - def agg_groups(self) -> Self: - ''' - Get the group indexes of the group by operation. - - Should be used in aggregation context only. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... "one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [94, 95, 96, 97, 97, 99], - ... } - ... ) - >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[u32] │ - ╞═══════╪═══════════╡ - │ one ┆ [0, 1, 2] │ - │ two ┆ [3, 4, 5] │ - └───────┴───────────┘ - - ''' - def count(self) -> Self: - ''' - Return the number of elements in the column. - - .. warning:: - Null values are treated like regular elements in this context. - - Examples - -------- - >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) - >>> df.select(pl.all().count()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 3 ┆ 3 │ - └─────┴─────┘ - - ''' - def len(self) -> Self: - ''' - Return the number of elements in the column. - - Null values are treated like regular elements in this context. - - Alias for :func:`count`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) - >>> df.select(pl.all().len()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 3 ┆ 3 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: - ''' - Get a slice of this expression. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. 
If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10, 11], - ... "b": [None, 4, 4, 4], - ... } - ... ) - >>> df.select(pl.all().slice(1, 2)) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 9 ┆ 4 │ - │ 10 ┆ 4 │ - └─────┴─────┘ - - ''' - def append(self, other: IntoExpr) -> Self: - ''' - Append expressions. - - This is done by adding the chunks of `other` to this `Series`. - - Parameters - ---------- - other - Expression to append. - upcast - Cast both `Series` to the same supertype. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10], - ... "b": [None, 4, 4], - ... } - ... ) - >>> df.select(pl.all().head(1).append(pl.all().tail(1))) - shape: (2, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 8 ┆ null │ - │ 10 ┆ 4 │ - └─────┴──────┘ - - ''' - def rechunk(self) -> Self: - ''' - Create a single chunk of memory for this Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - - Create a Series with 3 nulls, append column a then rechunk - - >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) - shape: (6, 1) - ┌────────┐ - │ repeat │ - │ --- │ - │ i64 │ - ╞════════╡ - │ null │ - │ null │ - │ null │ - │ 1 │ - │ 1 │ - │ 2 │ - └────────┘ - - ''' - def drop_nulls(self) -> Self: - ''' - Drop all null values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nans - - Notes - ----- - A null value is not the same as a NaN value. - To drop NaN values, use :func:`drop_nans`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) - >>> df.select(pl.col("a").drop_nulls()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 3.0 │ - │ NaN │ - └─────┘ - - ''' - def drop_nans(self) -> Self: - ''' - Drop all floating point NaN values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nulls - - Notes - ----- - A NaN value is not the same as a null value. - To drop null values, use :func:`drop_nulls`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) - >>> df.select(pl.col("a").drop_nans()) - shape: (3, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 1.0 │ - │ null │ - │ 3.0 │ - └──────┘ - - ''' - def cum_sum(self) -> Self: - ''' - Get an array with the cumulative sum computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_sum().alias("cum_sum"), - ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_sum ┆ cum_sum_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 10 │ - │ 2 ┆ 3 ┆ 9 │ - │ 3 ┆ 6 ┆ 7 │ - │ 4 ┆ 10 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - Null values are excluded, but can also be filled by calling `forward_fill`. - - >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) - >>> df.with_columns( - ... pl.col("values").cum_sum().alias("value_cum_sum"), - ... pl.col("values") - ... .cum_sum() - ... .forward_fill() - ... 
.alias("value_cum_sum_all_filled"), - ... ) - shape: (8, 3) - ┌────────┬───────────────┬──────────────────────────┐ - │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞════════╪═══════════════╪══════════════════════════╡ - │ null ┆ null ┆ null │ - │ 10 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 8 ┆ 18 ┆ 18 │ - │ 9 ┆ 27 ┆ 27 │ - │ null ┆ null ┆ 27 │ - │ 16 ┆ 43 ┆ 43 │ - │ null ┆ null ┆ 43 │ - └────────┴───────────────┴──────────────────────────┘ - - ''' - def cum_prod(self) -> Self: - ''' - Get an array with the cumulative product computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_prod().alias("cum_prod"), - ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), - ... ) - shape: (4, 3) - ┌─────┬──────────┬──────────────────┐ - │ a ┆ cum_prod ┆ cum_prod_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪══════════╪══════════════════╡ - │ 1 ┆ 1 ┆ 24 │ - │ 2 ┆ 2 ┆ 24 │ - │ 3 ┆ 6 ┆ 12 │ - │ 4 ┆ 24 ┆ 4 │ - └─────┴──────────┴──────────────────┘ - - ''' - def cum_min(self) -> Self: - ''' - Get an array with the cumulative min computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_min().alias("cum_min"), - ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_min ┆ cum_min_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 1 ┆ 2 │ - │ 3 ┆ 1 ┆ 3 │ - │ 4 ┆ 1 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - ''' - def cum_max(self) -> Self: - ''' - Get an array with the cumulative max computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_max().alias("cum_max"), - ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_max ┆ cum_max_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 4 │ - │ 2 ┆ 2 ┆ 4 │ - │ 3 ┆ 3 ┆ 4 │ - │ 4 ┆ 4 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - Null values are excluded, but can also be filled by calling `forward_fill`. - - >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) - >>> df.with_columns( - ... pl.col("values").cum_max().alias("cum_max"), - ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), - ... ) - shape: (8, 3) - ┌────────┬─────────┬────────────────────┐ - │ values ┆ cum_max ┆ cum_max_all_filled │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞════════╪═════════╪════════════════════╡ - │ null ┆ null ┆ null │ - │ 10 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 8 ┆ 10 ┆ 10 │ - │ 9 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 16 ┆ 16 ┆ 16 │ - │ null ┆ null ┆ 16 │ - └────────┴─────────┴────────────────────┘ - - ''' - def cum_count(self) -> Self: - ''' - Get an array with the cumulative count computed at every element. - - Counting from 0 to len - - Parameters - ---------- - reverse - Reverse the operation. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_count().alias("cum_count"), - ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), - ... ) - shape: (4, 3) - ┌─────┬───────────┬───────────────────┐ - │ a ┆ cum_count ┆ cum_count_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ u32 ┆ u32 │ - ╞═════╪═══════════╪═══════════════════╡ - │ 1 ┆ 0 ┆ 3 │ - │ 2 ┆ 1 ┆ 2 │ - │ 3 ┆ 2 ┆ 1 │ - │ 4 ┆ 3 ┆ 0 │ - └─────┴───────────┴───────────────────┘ - - ''' - def floor(self) -> Self: - ''' - Rounds down to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) - >>> df.select(pl.col("a").floor()) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - │ 0.0 │ - │ 1.0 │ - │ 1.0 │ - └─────┘ - - ''' - def ceil(self) -> Self: - ''' - Rounds up to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) - >>> df.select(pl.col("a").ceil()) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 1.0 │ - │ 1.0 │ - │ 2.0 │ - └─────┘ - - ''' - def round(self, decimals: int = ...) -> Self: - ''' - Round underlying floating point data by `decimals` digits. - - Parameters - ---------- - decimals - Number of decimals to round by. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) - >>> df.select(pl.col("a").round(1)) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.3 │ - │ 0.5 │ - │ 1.0 │ - │ 1.2 │ - └─────┘ - - ''' - def round_sig_figs(self, digits: int) -> Self: - ''' - Round to a number of significant figures. - - Parameters - ---------- - digits - Number of significant figures to round to. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) - >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) - shape: (3, 2) - ┌─────────┬────────────────┐ - │ a ┆ round_sig_figs │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════════╪════════════════╡ - │ 0.01234 ┆ 0.012 │ - │ 3.333 ┆ 3.3 │ - │ 1234.0 ┆ 1200.0 │ - └─────────┴────────────────┘ - - ''' - def dot(self, other: Expr | str) -> Self: - ''' - Compute the dot/inner product between two Expressions. - - Parameters - ---------- - other - Expression to compute dot product with. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> df.select(pl.col("a").dot(pl.col("b"))) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 44 │ - └─────┘ - - ''' - def mode(self) -> Self: - ''' - Compute the most occurring value(s). - - Can return multiple Values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 1, 2, 3], - ... "b": [1, 1, 2, 2], - ... } - ... ) - >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 1 │ - │ 1 ┆ 2 │ - └─────┴─────┘ - - ''' - def cast(self, dtype: PolarsDataType | type[Any]) -> Self: - ''' - Cast between data types. - - Parameters - ---------- - dtype - DataType to cast to. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["4", "5", "6"], - ... } - ... ) - >>> df.with_columns( - ... [ - ... pl.col("a").cast(pl.Float64), - ... 
pl.col("b").cast(pl.Int32), - ... ] - ... ) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ i32 │ - ╞═════╪═════╡ - │ 1.0 ┆ 4 │ - │ 2.0 ┆ 5 │ - │ 3.0 ┆ 6 │ - └─────┴─────┘ - - ''' - def sort(self) -> Self: - ''' - Sort this column. - - When used in a projection/selection context, the whole column is sorted. - When used in a group by context, the groups are sorted. - - Parameters - ---------- - descending - Sort in descending order. - nulls_last - Place null values last. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, None, 3, 2], - ... } - ... ) - >>> df.select(pl.col("a").sort()) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ null │ - │ 1 │ - │ 2 │ - │ 3 │ - └──────┘ - >>> df.select(pl.col("a").sort(descending=True)) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ null │ - │ 3 │ - │ 2 │ - │ 1 │ - └──────┘ - >>> df.select(pl.col("a").sort(nulls_last=True)) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ 1 │ - │ 2 │ - │ 3 │ - │ null │ - └──────┘ - - When sorting in a group by context, the groups are sorted. - - >>> df = pl.DataFrame( - ... { - ... "group": ["one", "one", "one", "two", "two", "two"], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌───────┬────────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪════════════╡ - │ two ┆ [3, 4, 99] │ - │ one ┆ [1, 2, 98] │ - └───────┴────────────┘ - - ''' - def top_k(self, k: int | IntoExprColumn = ...) -> Self: - ''' - Return the `k` largest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - bottom_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("value").top_k().alias("top_k"), - ... pl.col("value").bottom_k().alias("bottom_k"), - ... ] - ... ) - shape: (5, 2) - ┌───────┬──────────┐ - │ top_k ┆ bottom_k │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═══════╪══════════╡ - │ 99 ┆ 1 │ - │ 98 ┆ 2 │ - │ 4 ┆ 3 │ - │ 3 ┆ 4 │ - │ 2 ┆ 98 │ - └───────┴──────────┘ - - ''' - def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: - ''' - Return the `k` smallest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - top_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("value").top_k().alias("top_k"), - ... pl.col("value").bottom_k().alias("bottom_k"), - ... ] - ... ) - shape: (5, 2) - ┌───────┬──────────┐ - │ top_k ┆ bottom_k │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═══════╪══════════╡ - │ 99 ┆ 1 │ - │ 98 ┆ 2 │ - │ 4 ┆ 3 │ - │ 3 ┆ 4 │ - │ 2 ┆ 98 │ - └───────┴──────────┘ - - ''' - def arg_sort(self) -> Self: - ''' - Get the index values that would sort this column. - - Parameters - ---------- - descending - Sort in descending (descending) order. - nulls_last - Place null values last instead of first. - - Returns - ------- - Expr - Expression of data type :class:`UInt32`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... 
) - >>> df.select(pl.col("a").arg_sort()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - │ 0 │ - │ 2 │ - └─────┘ - - ''' - def arg_max(self) -> Self: - ''' - Get the index of the maximal value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... ) - >>> df.select(pl.col("a").arg_max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def arg_min(self) -> Self: - ''' - Get the index of the minimal value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... ) - >>> df.select(pl.col("a").arg_min()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - └─────┘ - - ''' - def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: - ''' - Find indices where elements should be inserted to maintain order. - - .. math:: a[i-1] < v <= a[i] - - Parameters - ---------- - element - Expression or scalar value. - side : {\'any\', \'left\', \'right\'} - If \'any\', the index of the first suitable location found is given. - If \'left\', the index of the leftmost suitable location found is given. - If \'right\', return the rightmost suitable location found is given. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "values": [1, 2, 3, 5], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("values").search_sorted(0).alias("zero"), - ... pl.col("values").search_sorted(3).alias("three"), - ... pl.col("values").search_sorted(6).alias("six"), - ... ] - ... ) - shape: (1, 3) - ┌──────┬───────┬─────┐ - │ zero ┆ three ┆ six │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞══════╪═══════╪═════╡ - │ 0 ┆ 2 ┆ 4 │ - └──────┴───────┴─────┘ - - ''' - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: - ''' - Sort this column by the ordering of other columns. - - When used in a projection/selection context, the whole column is sorted. - When used in a group by context, the groups are sorted. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. - *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "a", "b", "b"], - ... "value1": [1, 3, 4, 2], - ... "value2": [8, 7, 6, 5], - ... } - ... ) - >>> df.select(pl.col("group").sort_by("value1")) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ a │ - │ b │ - │ a │ - │ b │ - └───────┘ - - Sorting by expressions is also supported. - - >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ b │ - │ a │ - │ a │ - │ b │ - └───────┘ - - Sort by multiple columns by passing a list of columns. - - >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ b │ - │ a │ - │ b │ - │ a │ - └───────┘ - - Or use positional arguments to sort by multiple columns in the same way. 
- - >>> df.select(pl.col("group").sort_by("value1", "value2")) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ a │ - │ b │ - │ a │ - │ b │ - └───────┘ - - When sorting in a group by context, the groups are sorted. - - >>> df.group_by("group").agg( - ... pl.col("value1").sort_by("value2") - ... ) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value1 │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ a ┆ [3, 1] │ - │ b ┆ [2, 4] │ - └───────┴───────────┘ - - Take a single row from each group where a column attains its minimal value - within that group. - - >>> df.group_by("group").agg( - ... pl.all().sort_by("value2").first() - ... ) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌───────┬────────┬────────┐ - │ group ┆ value1 ┆ value2 | - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 | - ╞═══════╪════════╪════════╡ - │ a ┆ 3 ┆ 7 | - │ b ┆ 2 ┆ 5 | - └───────┴────────┴────────┘ - - ''' - def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: - ''' - Take values by index. - - Parameters - ---------- - indices - An expression that leads to a UInt32 dtyped Series. - - Returns - ------- - Expr - Expression of the same data type. - - See Also - -------- - Expr.get : Take a single value - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... "one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group", maintain_order=True).agg( - ... pl.col("value").gather([2, 1]) - ... ) - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ one ┆ [2, 98] │ - │ two ┆ [4, 99] │ - └───────┴───────────┘ - ''' - def get(self, index: int | Expr) -> Self: - ''' - Return a single value by index. - - Parameters - ---------- - index - An expression that leads to a UInt32 index. - - Returns - ------- - Expr - Expression of the same data type. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... "one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) - shape: (2, 2) - ┌───────┬───────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═══════╡ - │ one ┆ 98 │ - │ two ┆ 99 │ - └───────┴───────┘ - - ''' - def shift(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns(shift=pl.col("a").shift()) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ null │ - │ 2 ┆ 1 │ - │ 3 ┆ 2 │ - │ 4 ┆ 3 │ - └─────┴───────┘ - - Pass a negative value to shift in the opposite direction instead. 
- - >>> df.with_columns(shift=pl.col("a").shift(-2)) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - │ 3 ┆ null │ - │ 4 ┆ null │ - └─────┴───────┘ - - Specify `fill_value` to fill the resulting null values. - - >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - │ 3 ┆ 100 │ - │ 4 ┆ 100 │ - └─────┴───────┘ - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: - ''' - Fill null values using the specified value or strategy. - - To interpolate over null values see interpolate. - See the examples below to fill nulls with an expression. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... } - ... ) - >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 0 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(99)) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 99 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 4 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞══════╪═════╡ - │ 1 ┆ 4.0 │ - │ 2 ┆ 5.0 │ - │ null ┆ 6.0 │ - └──────┴─────┘ - >>> df.with_columns(pl.all().fill_null(pl.all().median())) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 1.0 ┆ 4.0 │ - │ 2.0 ┆ 5.0 │ - │ 1.5 ┆ 6.0 │ - └─────┴─────┘ - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Self: - ''' - Fill floating point NaN value with a fill value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, None, float("nan")], - ... "b": [4.0, float("nan"), 6], - ... } - ... ) - >>> df.with_columns(pl.col("b").fill_nan(0)) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪═════╡ - │ 1.0 ┆ 4.0 │ - │ null ┆ 0.0 │ - │ NaN ┆ 6.0 │ - └──────┴─────┘ - - ''' - def forward_fill(self, limit: int | None = ...) -> Self: - ''' - Fill missing values with the latest seen values. - - Parameters - ---------- - limit - The number of consecutive null values to forward fill. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... } - ... ) - >>> df.select(pl.all().forward_fill()) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 4 │ - │ 2 ┆ 6 │ - └─────┴─────┘ - - ''' - def backward_fill(self, limit: int | None = ...) -> Self: - ''' - Fill missing values with the next to be seen values. 
- - Parameters - ---------- - limit - The number of consecutive null values to backward fill. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... "c": [None, None, 2], - ... } - ... ) - >>> df.select(pl.all().backward_fill()) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 4 ┆ 2 │ - │ 2 ┆ 6 ┆ 2 │ - │ null ┆ 6 ┆ 2 │ - └──────┴─────┴─────┘ - >>> df.select(pl.all().backward_fill(limit=1)) - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ 1 ┆ 4 ┆ null │ - │ 2 ┆ 6 ┆ 2 │ - │ null ┆ 6 ┆ 2 │ - └──────┴─────┴──────┘ - - ''' - def reverse(self) -> Self: - ''' - Reverse the selection. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4, 5], - ... "fruits": ["banana", "banana", "apple", "apple", "banana"], - ... "B": [5, 4, 3, 2, 1], - ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], - ... } - ... ) - >>> df.select( - ... [ - ... pl.all(), - ... pl.all().reverse().name.suffix("_reverse"), - ... ] - ... ) - shape: (5, 8) - ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ - │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ - │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ - │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ - │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ - │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ - │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ - └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ - - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Get standard deviation. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").std()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Get variance. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").var()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def max(self) -> Self: - ''' - Get maximum value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) - >>> df.select(pl.col("a").max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def min(self) -> Self: - ''' - Get minimum value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) - >>> df.select(pl.col("a").min()) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ -1.0 │ - └──────┘ - - ''' - def nan_max(self) -> Self: - ''' - Get maximum value, but propagate/poison encountered NaN values. 
- - This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0, float("nan")]}) - >>> df.select(pl.col("a").nan_max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ NaN │ - └─────┘ - - ''' - def nan_min(self) -> Self: - ''' - Get minimum value, but propagate/poison encountered NaN values. - - This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0, float("nan")]}) - >>> df.select(pl.col("a").nan_min()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ NaN │ - └─────┘ - - ''' - def sum(self) -> Self: - ''' - Get sum value. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").sum()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 0 │ - └─────┘ - - ''' - def mean(self) -> Self: - ''' - Get mean value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").mean()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def median(self) -> Self: - ''' - Get median value using linear interpolation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").median()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def product(self) -> Self: - ''' - Compute the product of an expression. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").product()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def n_unique(self) -> Self: - ''' - Count unique values. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").n_unique()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def approx_n_unique(self) -> Self: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").approx_n_unique()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def null_count(self) -> Self: - ''' - Count null values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 1, None], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.select(pl.all().null_count()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 2 ┆ 0 │ - └─────┴─────┘ - - ''' - def arg_unique(self) -> Self: - ''' - Get index of first unique value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10], - ... "b": [None, 4, 4], - ... } - ... ) - >>> df.select(pl.col("a").arg_unique()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - │ 2 │ - └─────┘ - >>> df.select(pl.col("b").arg_unique()) - shape: (2, 1) - ┌─────┐ - │ b │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - └─────┘ - - ''' - def unique(self) -> Self: - ''' - Get unique values of this expression. - - Parameters - ---------- - maintain_order - Maintain order of data. This requires more work. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - │ 1 │ - └─────┘ - >>> df.select(pl.col("a").unique(maintain_order=True)) - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - └─────┘ - - ''' - def first(self) -> Self: - ''' - Get the first value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").first()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - └─────┘ - - ''' - def last(self) -> Self: - ''' - Get the last value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").last()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: - ''' - Compute expressions over the given groups. - - This expression is similar to performing a group by aggregation and joining the - result back into the original DataFrame. - - The outcome is similar to how `window functions - `_ - work in PostgreSQL. - - Parameters - ---------- - expr - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_exprs - Additional columns to group by, specified as positional arguments. - mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} - - group_to_rows - If the aggregation results in multiple values, assign them back to their - position in the DataFrame. This can only be done if the group yields - the same elements before aggregation as after. - - join - Join the groups as \'List\' to the row positions. - warning: this can be memory intensive. - - explode - Don\'t do any mapping, but simply flatten the group. - This only makes sense if the input data is sorted. - - Examples - -------- - Pass the name of a column to compute the expression over that column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "a", "b", "b", "b"], - ... "b": [1, 2, 3, 5, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... ) - >>> df.with_columns( - ... pl.col("c").max().over("a").name.suffix("_max"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_max │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 5 │ - │ b ┆ 3 ┆ 3 ┆ 3 │ - │ b ┆ 5 ┆ 2 ┆ 3 │ - │ b ┆ 3 ┆ 1 ┆ 3 │ - └─────┴─────┴─────┴───────┘ - - Expression input is supported. - - >>> df.with_columns( - ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_max │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 4 │ - │ b ┆ 5 ┆ 2 ┆ 2 │ - │ b ┆ 3 ┆ 1 ┆ 4 │ - └─────┴─────┴─────┴───────┘ - - Group by multiple columns by passing a list of column names or expressions. - - >>> df.with_columns( - ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_min │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 1 │ - │ b ┆ 5 ┆ 2 ┆ 2 │ - │ b ┆ 3 ┆ 1 ┆ 1 │ - └─────┴─────┴─────┴───────┘ - - Or use positional arguments to group by multiple columns in the same way. - - >>> df.with_columns( - ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_min │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 1 │ - │ b ┆ 5 ┆ 2 ┆ 1 │ - │ b ┆ 3 ┆ 1 ┆ 1 │ - └─────┴─────┴─────┴───────┘ - - ''' - def rolling(self, index_column: str) -> Self: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - If you have a time series ``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... - * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... - * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order. - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> df.with_columns( - ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), - ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), - ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), - ... 
) - shape: (6, 5) - ┌─────────────────────┬─────┬───────┬───────┬───────┐ - │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴─────┴───────┴───────┴───────┘ - - ''' - def is_unique(self) -> Self: - ''' - Get mask of unique values. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").is_unique()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ false │ - │ true │ - └───────┘ - - ''' - def is_first_distinct(self) -> Self: - ''' - Return a boolean mask indicating the first occurrence of each distinct value. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) - >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) - shape: (5, 2) - ┌─────┬───────┐ - │ a ┆ first │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪═══════╡ - │ 1 ┆ true │ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 2 ┆ false │ - └─────┴───────┘ - - ''' - def is_last_distinct(self) -> Self: - ''' - Return a boolean mask indicating the last occurrence of each distinct value. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) - >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) - shape: (5, 2) - ┌─────┬───────┐ - │ a ┆ last │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪═══════╡ - │ 1 ┆ false │ - │ 1 ┆ true │ - │ 2 ┆ false │ - │ 3 ┆ true │ - │ 2 ┆ true │ - └─────┴───────┘ - - ''' - def is_duplicated(self) -> Self: - ''' - Return a boolean mask indicating duplicated values. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").is_duplicated()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ true │ - │ true │ - │ false │ - └───────┘ - - ''' - def peak_max(self) -> Self: - ''' - Get a boolean mask of the local maximum peaks. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) - >>> df.select(pl.col("a").peak_max()) - shape: (5, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ false │ - │ false │ - │ false │ - │ true │ - └───────┘ - - ''' - def peak_min(self) -> Self: - ''' - Get a boolean mask of the local minimum peaks. - - Examples - -------- - >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) - >>> df.select(pl.col("a").peak_min()) - shape: (5, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ false │ - │ true │ - │ false │ - └───────┘ - - ''' - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Get quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) - >>> df.select(pl.col("a").quantile(0.3)) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 2.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.5 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.5 │ - └─────┘ - - ''' - def cut(self, breaks: Sequence[float]) -> Self: - ''' - Bin continuous values into discrete categories. - - Parameters - ---------- - breaks - List of unique cut points. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - left_closed - Set the intervals to be left-closed instead of right-closed. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - - Returns - ------- - Expr - Expression of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise an expression of data type :class:`Struct`. - - See Also - -------- - qcut - - Examples - -------- - Divide a column into three categories. - - >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) - >>> df.with_columns( - ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") - ... ) - shape: (5, 2) - ┌─────┬─────┐ - │ foo ┆ cut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪═════╡ - │ -2 ┆ a │ - │ -1 ┆ a │ - │ 0 ┆ b │ - │ 1 ┆ b │ - │ 2 ┆ c │ - └─────┴─────┘ - - Add both the category and the breakpoint. - - >>> df.with_columns( - ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") - ... ).unnest("cut") - shape: (5, 3) - ┌─────┬──────┬────────────┐ - │ foo ┆ brk ┆ foo_bin │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪══════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴──────┴────────────┘ - - ''' - def qcut(self, quantiles: Sequence[float] | int) -> Self: - ''' - Bin continuous values into discrete categories based on their quantiles. - - Parameters - ---------- - quantiles - Either a list of quantile probabilities between 0 and 1 or a positive - integer determining the number of bins with uniform probability. - labels - Names of the categories. The number of labels must be equal to the number - of categories. - left_closed - Set the intervals to be left-closed instead of right-closed. - allow_duplicates - If set to `True`, duplicates in the resulting quantiles are dropped, - rather than raising a `DuplicateError`. This can happen even with unique - probabilities, depending on the data. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - - Returns - ------- - Expr - Expression of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise an expression of data type :class:`Struct`. 
- - See Also - -------- - cut - - Examples - -------- - Divide a column into three categories according to pre-defined quantile - probabilities. - - >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) - >>> df.with_columns( - ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ foo ┆ qcut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪══════╡ - │ -2 ┆ a │ - │ -1 ┆ a │ - │ 0 ┆ b │ - │ 1 ┆ b │ - │ 2 ┆ c │ - └─────┴──────┘ - - Divide a column into two categories using uniform quantile probabilities. - - >>> df.with_columns( - ... pl.col("foo") - ... .qcut(2, labels=["low", "high"], left_closed=True) - ... .alias("qcut") - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ foo ┆ qcut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪══════╡ - │ -2 ┆ low │ - │ -1 ┆ low │ - │ 0 ┆ high │ - │ 1 ┆ high │ - │ 2 ┆ high │ - └─────┴──────┘ - - Add both the category and the breakpoint. - - >>> df.with_columns( - ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") - ... ).unnest("qcut") - shape: (5, 3) - ┌─────┬──────┬────────────┐ - │ foo ┆ brk ┆ foo_bin │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪══════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴──────┴────────────┘ - - ''' - def rle(self) -> Self: - ''' - Get the lengths of runs of identical values. - - Returns - ------- - Expr - Expression of data type :class:`Struct` with Fields "lengths" and "values". - - Examples - -------- - >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) - >>> df.select(pl.col("s").rle()).unnest("s") - shape: (6, 2) - ┌─────────┬────────┐ - │ lengths ┆ values │ - │ --- ┆ --- │ - │ i32 ┆ i64 │ - ╞═════════╪════════╡ - │ 2 ┆ 1 │ - │ 1 ┆ 2 │ - │ 1 ┆ 1 │ - │ 1 ┆ null │ - │ 1 ┆ 1 │ - │ 2 ┆ 3 │ - └─────────┴────────┘ - ''' - def rle_id(self) -> Self: - ''' - Map values to run IDs. - - Similar to RLE, but it maps each value to an ID corresponding to the run into - which it falls. This is especially useful when you want to define groups by - runs of identical values rather than the values themselves. - - - Examples - -------- - >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) - >>> # It works on structs of multiple values too! - >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) - shape: (5, 4) - ┌─────┬──────┬─────┬──────┐ - │ a ┆ b ┆ a_r ┆ ab_r │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ u32 ┆ u32 │ - ╞═════╪══════╪═════╪══════╡ - │ 1 ┆ x ┆ 0 ┆ 0 │ - │ 2 ┆ x ┆ 1 ┆ 1 │ - │ 1 ┆ null ┆ 2 ┆ 2 │ - │ 1 ┆ y ┆ 2 ┆ 3 │ - │ 1 ┆ y ┆ 2 ┆ 3 │ - └─────┴──────┴─────┴──────┘ - ''' - def filter(self, predicate: Expr) -> Self: - ''' - Filter a single column. - - The original order of the remaining elements is preserved. - - Mostly useful in an aggregation context. If you want to filter on a DataFrame - level, use `LazyFrame.filter`. - - Parameters - ---------- - predicate - Boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group_col": ["g1", "g1", "g2"], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.group_by("group_col").agg( - ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), - ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), - ... 
).sort("group_col") - shape: (2, 3) - ┌───────────┬─────┬─────┐ - │ group_col ┆ lt ┆ gte │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═══════════╪═════╪═════╡ - │ g1 ┆ 1 ┆ 2 │ - │ g2 ┆ 0 ┆ 3 │ - └───────────┴─────┴─────┘ - - ''' - def where(self, predicate: Expr) -> Self: - ''' - Filter a single column. - - Alias for :func:`filter`. - - Parameters - ---------- - predicate - Boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group_col": ["g1", "g1", "g2"], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.group_by("group_col").agg( - ... [ - ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), - ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), - ... ] - ... ).sort("group_col") - shape: (2, 3) - ┌───────────┬─────┬─────┐ - │ group_col ┆ lt ┆ gte │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═══════════╪═════╪═════╡ - │ g1 ┆ 1 ┆ 2 │ - │ g2 ┆ 0 ┆ 3 │ - └───────────┴─────┴─────┘ - - ''' - def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Apply a custom python function to a whole Series or sequence of Series. - - The output of this custom function must be a Series. If you want to apply a - custom function elementwise over single values, see :func:`map_elements`. - A reasonable use case for `map` functions is transforming the values - represented by an expression using a third-party library. - - Read more in `the book - `_. - - Parameters - ---------- - function - Lambda/function to apply. - return_dtype - Dtype of the output Series. - agg_list - Aggregate list. - - Notes - ----- - If you are looking to map a function over a window function or group_by context, - refer to func:`map_elements` instead. - - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - See Also - -------- - map_elements - replace - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "sine": [0.0, 1.0, 0.0, -1.0], - ... "cosine": [1.0, 0.0, -1.0, 0.0], - ... } - ... ) - >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) - shape: (1, 2) - ┌──────┬────────┐ - │ sine ┆ cosine │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪════════╡ - │ 1 ┆ 0 │ - └──────┴────────┘ - - ''' - def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Map a custom/user-defined function (UDF) to each element of a column. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - The UDF is applied to each element of a column. Note that, in a GroupBy - context, the column will have been pre-aggregated and so each element - will itself be a Series. Therefore, depending on the context, - requirements for `function` differ: - - * Selection - Expects `function` to be of type `Callable[[Any], Any]`. - Applies a Python function to each individual value in the column. - * GroupBy - Expects `function` to be of type `Callable[[Series], Any]`. - For each group, applies a Python function to the slice of the column - corresponding to that group. - - Parameters - ---------- - function - Lambda/function to map. - return_dtype - Dtype of the output Series. - If not set, the dtype will be `pl.Unknown`. - skip_nulls - Don\'t map the function over values that contain nulls (this is faster). 
- pass_name - Pass the Series name to the custom function (this is more expensive). - strategy : {\'thread_local\', \'threading\'} - This functionality is considered experimental and may be removed/changed. - - - \'thread_local\': run the python function on a single thread. - - \'threading\': run the python function on separate threads. Use with - care as this can slow performance. This might only speed up - your code if the amount of work per element is significant - and the python function releases the GIL (e.g. via calling - a c function) - - Notes - ----- - * Using `map_elements` is strongly discouraged as you will be effectively - running python "for" loops, which will be very slow. Wherever possible you - should prefer the native expression API to achieve the best performance. - - * If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. - - * Window function application using `over` is considered a GroupBy context - here, so `map_elements` can be used to map functions over window groups. - - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["a", "b", "c", "c"], - ... } - ... ) - - The function is applied to each element of column `\'a\'`: - - >>> df.with_columns( # doctest: +SKIP - ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────────┐ - │ a ┆ b ┆ a_times_2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 │ - ╞═════╪═════╪═══════════╡ - │ 1 ┆ a ┆ 2 │ - │ 2 ┆ b ┆ 4 │ - │ 3 ┆ c ┆ 6 │ - │ 1 ┆ c ┆ 2 │ - └─────┴─────┴───────────┘ - - Tip: it is better to implement this with an expression: - - >>> df.with_columns( - ... (pl.col("a") * 2).alias("a_times_2"), - ... ) # doctest: +IGNORE_RESULT - - In a GroupBy context, each element of the column is itself a Series: - - >>> ( - ... df.lazy().group_by("b").agg(pl.col("a")).collect() - ... ) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬───────────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [1] │ - │ b ┆ [2] │ - │ c ┆ [3, 1] │ - └─────┴───────────┘ - - Therefore, from the user\'s point-of-view, the function is applied per-group: - - >>> ( - ... df.lazy() - ... .group_by("b") - ... .agg(pl.col("a").map_elements(lambda x: x.sum())) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ b ┆ 2 │ - │ c ┆ 4 │ - └─────┴─────┘ - - Tip: again, it is better to implement this with an expression: - - >>> ( - ... df.lazy() - ... .group_by("b", maintain_order=True) - ... .agg(pl.col("a").sum()) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - - Window function application using `over` will behave as a GroupBy - context, with your function receiving individual window groups: - - >>> df = pl.DataFrame( - ... { - ... "key": ["x", "x", "y", "x", "y", "z"], - ... "val": [1, 1, 1, 1, 1, 1], - ... } - ... ) - >>> df.with_columns( - ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), - ... 
).sort("key") - shape: (6, 3) - ┌─────┬─────┬────────┐ - │ key ┆ val ┆ scaled │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪════════╡ - │ x ┆ 1 ┆ 3 │ - │ x ┆ 1 ┆ 3 │ - │ x ┆ 1 ┆ 3 │ - │ y ┆ 1 ┆ 2 │ - │ y ┆ 1 ┆ 2 │ - │ z ┆ 1 ┆ 1 │ - └─────┴─────┴────────┘ - - Note that this function would *also* be better-implemented natively: - - >>> df.with_columns( - ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), - ... ).sort( - ... "key" - ... ) # doctest: +IGNORE_RESULT - - ''' - def flatten(self) -> Self: - ''' - Flatten a list or string column. - - Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "b", "b"], - ... "values": [[1, 2], [2, 3], [4]], - ... } - ... ) - >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ values │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ a ┆ [1, 2] │ - │ b ┆ [2, 3, 4] │ - └───────┴───────────┘ - - ''' - def explode(self) -> Self: - ''' - Explode a list expression. - - This means that every item is expanded to a new row. - - Returns - ------- - Expr - Expression with the data type of the list elements. - - See Also - -------- - Expr.list.explode : Explode a list column. - Expr.str.explode : Explode a string column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "b"], - ... "values": [ - ... [1, 2], - ... [3, 4], - ... ], - ... } - ... ) - >>> df.select(pl.col("values").explode()) - shape: (4, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 1 │ - │ 2 │ - │ 3 │ - │ 4 │ - └────────┘ - - ''' - def implode(self) -> Self: - ''' - Aggregate values into a list. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [4, 5, 6], - ... } - ... ) - >>> df.select(pl.all().implode()) - shape: (1, 2) - ┌───────────┬───────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ list[i64] ┆ list[i64] │ - ╞═══════════╪═══════════╡ - │ [1, 2, 3] ┆ [4, 5, 6] │ - └───────────┴───────────┘ - - ''' - def gather_every(self, n: int) -> Self: - ''' - Take every nth value in the Series and return as a new Series. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) - >>> df.select(pl.col("foo").gather_every(3)) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 4 │ - │ 7 │ - └─────┘ - - ''' - def head(self, n: int | Expr = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.head(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def tail(self, n: int | Expr = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.tail(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 5 │ - │ 6 │ - │ 7 │ - └─────┘ - - ''' - def limit(self, n: int | Expr = ...) -> Self: - ''' - Get the first `n` rows (alias for :func:`Expr.head`). - - Parameters - ---------- - n - Number of rows to return. 
- - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.limit(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def and_(self, *others: Any) -> Self: - ''' - Method equivalent of bitwise "and" operator `expr & other & ...`. - - Parameters - ---------- - *others - One or more integer or boolean expressions to evaluate/combine. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5, 6, 7, 4, 8], - ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], - ... "z": [-9, 2, -1, 4, 8], - ... } - ... ) - >>> df.select( - ... (pl.col("x") >= pl.col("z")) - ... .and_( - ... pl.col("y") >= pl.col("z"), - ... pl.col("y") == pl.col("y"), - ... pl.col("z") <= pl.col("x"), - ... pl.col("y") != pl.col("x"), - ... ) - ... .alias("all") - ... ) - shape: (5, 1) - ┌───────┐ - │ all │ - │ --- │ - │ bool │ - ╞═══════╡ - │ true │ - │ true │ - │ true │ - │ false │ - │ false │ - └───────┘ - - ''' - def or_(self, *others: Any) -> Self: - ''' - Method equivalent of bitwise "or" operator `expr | other | ...`. - - Parameters - ---------- - *others - One or more integer or boolean expressions to evaluate/combine. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5, 6, 7, 4, 8], - ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], - ... "z": [-9, 2, -1, 4, 8], - ... } - ... ) - >>> df.select( - ... (pl.col("x") == pl.col("y")) - ... .or_( - ... pl.col("x") == pl.col("y"), - ... pl.col("y") == pl.col("z"), - ... pl.col("y").cast(int) == pl.col("z"), - ... ) - ... .alias("any") - ... ) - shape: (5, 1) - ┌───────┐ - │ any │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ false │ - │ true │ - │ false │ - └───────┘ - - ''' - def eq(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr == other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0], - ... "y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").eq(pl.col("y")).alias("x == y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x == y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 1.0 ┆ 2.0 ┆ false │ - │ 2.0 ┆ 2.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 4.0 ┆ 4.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def eq_missing(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr == other` where `None == None`. - - This differs from default `eq` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], - ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").eq(pl.col("y")).alias("x eq y"), - ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), - ... 
) - shape: (6, 4) - ┌──────┬──────┬────────┬────────────────┐ - │ x ┆ y ┆ x eq y ┆ x eq_missing y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪══════╪════════╪════════════════╡ - │ 1.0 ┆ 2.0 ┆ false ┆ false │ - │ 2.0 ┆ 2.0 ┆ true ┆ true │ - │ NaN ┆ NaN ┆ false ┆ false │ - │ 4.0 ┆ 4.0 ┆ true ┆ true │ - │ null ┆ 5.0 ┆ null ┆ false │ - │ null ┆ null ┆ null ┆ true │ - └──────┴──────┴────────┴────────────────┘ - - ''' - def ge(self, other: Any) -> Self: - ''' - Method equivalent of "greater than or equal" operator `expr >= other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 2.0], - ... "y": [5.0, 3.0, float("nan"), 1.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ge(pl.col("y")).alias("x >= y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x >= y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 5.0 ┆ 5.0 ┆ true │ - │ 4.0 ┆ 3.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 2.0 ┆ 1.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def gt(self, other: Any) -> Self: - ''' - Method equivalent of "greater than" operator `expr > other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 2.0], - ... "y": [5.0, 3.0, float("nan"), 1.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").gt(pl.col("y")).alias("x > y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────┐ - │ x ┆ y ┆ x > y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪═══════╡ - │ 5.0 ┆ 5.0 ┆ false │ - │ 4.0 ┆ 3.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 2.0 ┆ 1.0 ┆ true │ - └─────┴─────┴───────┘ - - ''' - def le(self, other: Any) -> Self: - ''' - Method equivalent of "less than or equal" operator `expr <= other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 0.5], - ... "y": [5.0, 3.5, float("nan"), 2.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").le(pl.col("y")).alias("x <= y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x <= y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 5.0 ┆ 5.0 ┆ true │ - │ 4.0 ┆ 3.5 ┆ false │ - │ NaN ┆ NaN ┆ false │ - │ 0.5 ┆ 2.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def lt(self, other: Any) -> Self: - ''' - Method equivalent of "less than" operator `expr < other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 3.0], - ... "y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").lt(pl.col("y")).alias("x < y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────┐ - │ x ┆ y ┆ x < y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪═══════╡ - │ 1.0 ┆ 2.0 ┆ true │ - │ 2.0 ┆ 2.0 ┆ false │ - │ NaN ┆ NaN ┆ false │ - │ 3.0 ┆ 4.0 ┆ true │ - └─────┴─────┴───────┘ - - ''' - def ne(self, other: Any) -> Self: - ''' - Method equivalent of inequality operator `expr != other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0], - ... 
"y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ne(pl.col("y")).alias("x != y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x != y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 1.0 ┆ 2.0 ┆ true │ - │ 2.0 ┆ 2.0 ┆ false │ - │ NaN ┆ NaN ┆ true │ - │ 4.0 ┆ 4.0 ┆ false │ - └─────┴─────┴────────┘ - - ''' - def ne_missing(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr != other` where `None == None`. - - This differs from default `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], - ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ne(pl.col("y")).alias("x ne y"), - ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), - ... ) - shape: (6, 4) - ┌──────┬──────┬────────┬────────────────┐ - │ x ┆ y ┆ x ne y ┆ x ne_missing y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪══════╪════════╪════════════════╡ - │ 1.0 ┆ 2.0 ┆ true ┆ true │ - │ 2.0 ┆ 2.0 ┆ false ┆ false │ - │ NaN ┆ NaN ┆ true ┆ true │ - │ 4.0 ┆ 4.0 ┆ false ┆ false │ - │ null ┆ 5.0 ┆ null ┆ true │ - │ null ┆ null ┆ null ┆ false │ - └──────┴──────┴────────┴────────────────┘ - - ''' - def add(self, other: Any) -> Self: - ''' - Method equivalent of addition operator `expr + other`. - - Parameters - ---------- - other - numeric or string value; accepts expression input. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) - >>> df.with_columns( - ... pl.col("x").add(2).alias("x+int"), - ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), - ... ) - shape: (5, 3) - ┌─────┬───────┬────────┐ - │ x ┆ x+int ┆ x+expr │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═══════╪════════╡ - │ 1 ┆ 3 ┆ 2 │ - │ 2 ┆ 4 ┆ 4 │ - │ 3 ┆ 5 ┆ 9 │ - │ 4 ┆ 6 ┆ 28 │ - │ 5 ┆ 7 ┆ 125 │ - └─────┴───────┴────────┘ - - >>> df = pl.DataFrame( - ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} - ... ) - >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) - shape: (3, 4) - ┌─────┬─────┬─────┬─────┐ - │ x ┆ y ┆ z ┆ xyz │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ str ┆ str │ - ╞═════╪═════╪═════╪═════╡ - │ a ┆ b ┆ c ┆ abc │ - │ d ┆ e ┆ f ┆ def │ - │ g ┆ h ┆ i ┆ ghi │ - └─────┴─────┴─────┴─────┘ - - ''' - def floordiv(self, other: Any) -> Self: - ''' - Method equivalent of integer division operator `expr // other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - See Also - -------- - truediv - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) - >>> df.with_columns( - ... pl.col("x").truediv(2).alias("x/2"), - ... pl.col("x").floordiv(2).alias("x//2"), - ... ) - shape: (5, 3) - ┌─────┬─────┬──────┐ - │ x ┆ x/2 ┆ x//2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ i64 │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 0.5 ┆ 0 │ - │ 2 ┆ 1.0 ┆ 1 │ - │ 3 ┆ 1.5 ┆ 1 │ - │ 4 ┆ 2.0 ┆ 2 │ - │ 5 ┆ 2.5 ┆ 2 │ - └─────┴─────┴──────┘ - - ''' - def mod(self, other: Any) -> Self: - ''' - Method equivalent of modulus operator `expr % other`. - - Parameters - ---------- - other - Numeric literal or expression value. 
- - Examples - -------- - >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) - >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) - shape: (5, 2) - ┌─────┬─────┐ - │ x ┆ x%2 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 0 ┆ 0 │ - │ 1 ┆ 1 │ - │ 2 ┆ 0 │ - │ 3 ┆ 1 │ - │ 4 ┆ 0 │ - └─────┴─────┘ - - ''' - def mul(self, other: Any) -> Self: - ''' - Method equivalent of multiplication operator `expr * other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) - >>> df.with_columns( - ... pl.col("x").mul(2).alias("x*2"), - ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), - ... ) - shape: (5, 3) - ┌─────┬─────┬───────────┐ - │ x ┆ x*2 ┆ x * xlog2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ f64 │ - ╞═════╪═════╪═══════════╡ - │ 1 ┆ 2 ┆ 0.0 │ - │ 2 ┆ 4 ┆ 2.0 │ - │ 4 ┆ 8 ┆ 8.0 │ - │ 8 ┆ 16 ┆ 24.0 │ - │ 16 ┆ 32 ┆ 64.0 │ - └─────┴─────┴───────────┘ - - ''' - def sub(self, other: Any) -> Self: - ''' - Method equivalent of subtraction operator `expr - other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("x").sub(2).alias("x-2"), - ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), - ... ) - shape: (5, 3) - ┌─────┬─────┬────────┐ - │ x ┆ x-2 ┆ x-expr │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪════════╡ - │ 0 ┆ -2 ┆ 0 │ - │ 1 ┆ -1 ┆ 0 │ - │ 2 ┆ 0 ┆ -1 │ - │ 3 ┆ 1 ┆ -3 │ - │ 4 ┆ 2 ┆ -6 │ - └─────┴─────┴────────┘ - - ''' - def truediv(self, other: Any) -> Self: - ''' - Method equivalent of float division operator `expr / other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Notes - ----- - Zero-division behaviour follows IEEE-754: - - 0/0: Invalid operation - mathematically undefined, returns NaN. - n/0: On finite operands gives an exact infinite result, eg: ±infinity. - - See Also - -------- - floordiv - - Examples - -------- - >>> df = pl.DataFrame( - ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} - ... ) - >>> df.with_columns( - ... pl.col("x").truediv(2).alias("x/2"), - ... pl.col("x").truediv(pl.col("y")).alias("x/y"), - ... ) - shape: (5, 4) - ┌─────┬──────┬──────┬───────┐ - │ x ┆ y ┆ x/2 ┆ x/y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ f64 ┆ f64 │ - ╞═════╪══════╪══════╪═══════╡ - │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ - │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ - │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ - │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ - │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ - └─────┴──────┴──────┴───────┘ - - ''' - def pow(self, exponent: int | float | None | Series | Expr) -> Self: - ''' - Method equivalent of exponentiation operator `expr ** exponent`. - - Parameters - ---------- - exponent - Numeric literal or expression exponent value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) - >>> df.with_columns( - ... pl.col("x").pow(3).alias("cube"), - ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), - ... ) - shape: (4, 3) - ┌─────┬───────┬────────────┐ - │ x ┆ cube ┆ x ** xlog2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ f64 │ - ╞═════╪═══════╪════════════╡ - │ 1 ┆ 1.0 ┆ 1.0 │ - │ 2 ┆ 8.0 ┆ 2.0 │ - │ 4 ┆ 64.0 ┆ 16.0 │ - │ 8 ┆ 512.0 ┆ 512.0 │ - └─────┴───────┴────────────┘ - - ''' - def xor(self, other: Any) -> Self: - ''' - Method equivalent of bitwise exclusive-or operator `expr ^ other`. - - Parameters - ---------- - other - Integer or boolean value; accepts expression input. 
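The `truediv` notes above state that zero division follows IEEE-754. A short sketch of what that looks like in practice (illustrative only; the operator form `/` and the `truediv` method are assumed to be equivalent, as the docstring implies):

import polars as pl

df = pl.DataFrame({"x": [1.0, -1.0, 0.0], "y": [0.0, 0.0, 0.0]})
df.with_columns(
    (pl.col("x") / pl.col("y")).alias("x / y"),         # inf, -inf, NaN
    pl.col("x").truediv(pl.col("y")).alias("truediv"),  # same values via the method form
)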
- - Examples - -------- - >>> df = pl.DataFrame( - ... {"x": [True, False, True, False], "y": [True, True, False, False]} - ... ) - >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) - shape: (4, 3) - ┌───────┬───────┬───────┐ - │ x ┆ y ┆ x ^ y │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞═══════╪═══════╪═══════╡ - │ true ┆ true ┆ false │ - │ false ┆ true ┆ true │ - │ true ┆ false ┆ true │ - │ false ┆ false ┆ false │ - └───────┴───────┴───────┘ - - >>> def binary_string(n: int) -> str: - ... return bin(n)[2:].zfill(8) - >>> - >>> df = pl.DataFrame( - ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, - ... schema={"x": pl.UInt8, "y": pl.UInt8}, - ... ) - >>> df.with_columns( - ... pl.col("x").map_elements(binary_string).alias("bin_x"), - ... pl.col("y").map_elements(binary_string).alias("bin_y"), - ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), - ... pl.col("x") - ... .xor(pl.col("y")) - ... .map_elements(binary_string) - ... .alias("bin_xor_xy"), - ... ) - shape: (4, 6) - ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ - │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ - ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ - │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ - │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ - │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ - │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ - └─────┴─────┴──────────┴──────────┴────────┴────────────┘ - - ''' - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: - ''' - Check if elements of this expression are present in the other Series. - - Parameters - ---------- - other - Series or sequence of primitive type. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} - ... ) - >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) - shape: (3, 3) - ┌───────────┬──────────────────┬──────────┐ - │ sets ┆ optional_members ┆ contains │ - │ --- ┆ --- ┆ --- │ - │ list[i64] ┆ i64 ┆ bool │ - ╞═══════════╪══════════════════╪══════════╡ - │ [1, 2, 3] ┆ 1 ┆ true │ - │ [1, 2] ┆ 2 ┆ true │ - │ [9, 10] ┆ 3 ┆ false │ - └───────────┴──────────────────┴──────────┘ - - ''' - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: - ''' - Repeat the elements in this Series as specified in the given expression. - - The repeated elements are expanded into a `List`. - - Parameters - ---------- - by - Numeric column that determines how often the values will be repeated. - The column will be coerced to UInt32. Give this dtype to make the coercion a - no-op. - - Returns - ------- - Expr - Expression of data type :class:`List`, where the inner data type is equal - to the original data type. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["x", "y", "z"], - ... "n": [1, 2, 3], - ... } - ... ) - >>> df.select(pl.col("a").repeat_by("n")) - shape: (3, 1) - ┌─────────────────┐ - │ a │ - │ --- │ - │ list[str] │ - ╞═════════════════╡ - │ ["x"] │ - │ ["y", "y"] │ - │ ["z", "z", "z"] │ - └─────────────────┘ - - ''' - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: - ''' - Check if this expression is between the given start and end values. - - Parameters - ---------- - lower_bound - Lower bound value. Accepts expression input. 
Strings are parsed as column - names, other non-expression inputs are parsed as literals. - upper_bound - Upper bound value. Accepts expression input. Strings are parsed as column - names, other non-expression inputs are parsed as literals. - closed : {\'both\', \'left\', \'right\', \'none\'} - Define which sides of the interval are closed (inclusive). - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) - >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) - shape: (5, 2) - ┌─────┬────────────┐ - │ num ┆ is_between │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪════════════╡ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 4 ┆ true │ - │ 5 ┆ false │ - └─────┴────────────┘ - - Use the `closed` argument to include or exclude the values at the bounds: - - >>> df.with_columns( - ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") - ... ) - shape: (5, 2) - ┌─────┬────────────┐ - │ num ┆ is_between │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪════════════╡ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 4 ┆ false │ - │ 5 ┆ false │ - └─────┴────────────┘ - - You can also use strings as well as numeric/temporal values (note: ensure that - string literals are wrapped with `lit` so as not to conflate them with - column names): - - >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) - >>> df.with_columns( - ... pl.col("a") - ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") - ... .alias("is_between") - ... ) - shape: (5, 2) - ┌─────┬────────────┐ - │ a ┆ is_between │ - │ --- ┆ --- │ - │ str ┆ bool │ - ╞═════╪════════════╡ - │ a ┆ true │ - │ b ┆ true │ - │ c ┆ true │ - │ d ┆ false │ - │ e ┆ false │ - └─────┴────────────┘ - - ''' - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: - ''' - Hash the elements in the selection. - - The hash value is of type `UInt64`. - - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. - - Notes - ----- - This implementation of :func:`rows` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": ["x", None, "z"], - ... } - ... ) - >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌──────────────────────┬──────────────────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u64 ┆ u64 │ - ╞══════════════════════╪══════════════════════╡ - │ 9774092659964970114 ┆ 13614470193936745724 │ - │ 1101441246220388612 ┆ 11638928888656214026 │ - │ 11638928888656214026 ┆ 13382926553367784577 │ - └──────────────────────┴──────────────────────┘ - - ''' - def reinterpret(self) -> Self: - ''' - Reinterpret the underlying bits as a signed/unsigned integer. - - This operation is only allowed for 64bit integers. For lower bits integers, - you can safely use that cast operation. - - Parameters - ---------- - signed - If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. - - Examples - -------- - >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) - >>> df = pl.DataFrame([s]) - >>> df.select( - ... [ - ... 
pl.col("a").reinterpret(signed=True).alias("reinterpreted"), - ... pl.col("a").alias("original"), - ... ] - ... ) - shape: (3, 2) - ┌───────────────┬──────────┐ - │ reinterpreted ┆ original │ - │ --- ┆ --- │ - │ i64 ┆ u64 │ - ╞═══════════════╪══════════╡ - │ 1 ┆ 1 │ - │ 1 ┆ 1 │ - │ 2 ┆ 2 │ - └───────────────┴──────────┘ - - ''' - def inspect(self, fmt: str = ...) -> Self: - ''' - Print the value that this expression evaluates to and pass on the value. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 1, 2]}) - >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) - value is: shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 4 - ] - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 4 │ - └─────┘ - - ''' - def interpolate(self, method: InterpolationMethod = ...) -> Self: - ''' - Fill null values using interpolation. - - Parameters - ---------- - method : {\'linear\', \'nearest\'} - Interpolation method. - - Examples - -------- - Fill null values using linear interpolation. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, None, 3], - ... "b": [1.0, float("nan"), 3.0], - ... } - ... ) - >>> df.select(pl.all().interpolate()) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 1.0 ┆ 1.0 │ - │ 2.0 ┆ NaN │ - │ 3.0 ┆ 3.0 │ - └─────┴─────┘ - - Fill null values using nearest interpolation. - - >>> df.select(pl.all().interpolate("nearest")) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 1.0 │ - │ 3 ┆ NaN │ - │ 3 ┆ 3.0 │ - └─────┴─────┘ - - Regrid data to a new grid. - - >>> df_original_grid = pl.DataFrame( - ... { - ... "grid_points": [1, 3, 10], - ... "values": [2.0, 6.0, 20.0], - ... } - ... ) # Interpolate from this to the new grid - >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) - >>> df_new_grid.join( - ... df_original_grid, on="grid_points", how="left" - ... ).with_columns(pl.col("values").interpolate()) - shape: (10, 2) - ┌─────────────┬────────┐ - │ grid_points ┆ values │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════════════╪════════╡ - │ 1 ┆ 2.0 │ - │ 2 ┆ 4.0 │ - │ 3 ┆ 6.0 │ - │ 4 ┆ 8.0 │ - │ … ┆ … │ - │ 7 ┆ 14.0 │ - │ 8 ┆ 16.0 │ - │ 9 ┆ 18.0 │ - │ 10 ┆ 20.0 │ - └─────────────┴────────┘ - - ''' - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling min (moving min) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their min. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. 
Can be a fixed integer size, or a dynamic - temporal size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 4.0 │ - │ 6.0 ┆ 5.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.25 │ - │ 3.0 ┆ 0.5 │ - │ 4.0 ┆ 0.75 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ 1.25 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 4.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... 
).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - >>> df_temporal.with_columns( - ... rolling_row_min=pl.col("row_nr").rolling_min( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_min │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling max (moving max) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their max. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. 
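The rolling examples above use a temporal window driven by a `by` column of dtype Date or Datetime. A condensed, runnable version of the docstring's own example, using the helper and parameter names as documented here (`with_row_count`, `by=`, `closed=`); later Polars releases rename some of these, so treat the exact spelling as version-dependent:

from datetime import datetime
import polars as pl

df_temporal = pl.DataFrame(
    {"date": pl.datetime_range(datetime(2001, 1, 1), datetime(2001, 1, 1, 6), "1h", eager=True)}
).with_row_count()
df_temporal.with_columns(
    rolling_row_min=pl.col("row_nr").rolling_min(window_size="2h", by="date", closed="left")
)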
- closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ 6.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.25 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 3.75 │ - │ 6.0 ┆ 4.5 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 3.0 │ - │ 3.0 ┆ 4.0 │ - │ 4.0 ┆ 5.0 │ - │ 5.0 ┆ 6.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling max with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_max=pl.col("row_nr").rolling_max( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_max │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling max with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_max=pl.col("row_nr").rolling_max( - ... 
window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_max │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling mean (moving mean) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their mean. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. 
- - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴──────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.75 │ - │ 3.0 ┆ 2.75 │ - │ 4.0 ┆ 3.75 │ - │ 5.0 ┆ 4.75 │ - │ 6.0 ┆ 5.75 │ - └─────┴──────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ null │ - └─────┴──────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling mean with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_mean=pl.col("row_nr").rolling_mean( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬──────────────────┐ - │ row_nr ┆ date ┆ rolling_row_mean │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪══════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ - └────────┴─────────────────────┴──────────────────┘ - - Compute the rolling mean with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_mean=pl.col("row_nr").rolling_mean( - ... window_size="2h", by="date", closed="both" - ... ) - ... 
) - shape: (25, 3) - ┌────────┬─────────────────────┬──────────────────┐ - │ row_nr ┆ date ┆ rolling_row_mean │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪══════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ - └────────┴─────────────────────┴──────────────────┘ - - ''' - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling sum (moving sum) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their sum. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - of dtype `{Date, Datetime}` - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum(window_size=2), - ... 
) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 3.0 │ - │ 3.0 ┆ 5.0 │ - │ 4.0 ┆ 7.0 │ - │ 5.0 ┆ 9.0 │ - │ 6.0 ┆ 11.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.75 │ - │ 3.0 ┆ 2.75 │ - │ 4.0 ┆ 3.75 │ - │ 5.0 ┆ 4.75 │ - │ 6.0 ┆ 5.75 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 6.0 │ - │ 3.0 ┆ 9.0 │ - │ 4.0 ┆ 12.0 │ - │ 5.0 ┆ 15.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling sum with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_sum=pl.col("row_nr").rolling_sum( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_sum │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling sum with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_sum=pl.col("row_nr").rolling_sum( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_sum │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Self: - ''' - Compute a rolling standard deviation. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` means - the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.707107 │ - │ 3.0 ┆ 0.707107 │ - │ 4.0 ┆ 0.707107 │ - │ 5.0 ┆ 0.707107 │ - │ 6.0 ┆ 0.707107 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... 
) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.433013 │ - │ 3.0 ┆ 0.433013 │ - │ 4.0 ┆ 0.433013 │ - │ 5.0 ┆ 0.433013 │ - │ 6.0 ┆ 0.433013 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 1.0 │ - │ 4.0 ┆ 1.0 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling std with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_std=pl.col("row_nr").rolling_std( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_std │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling std with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_std=pl.col("row_nr").rolling_std( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_std │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling variance. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... 
- - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.5 │ - │ 3.0 ┆ 0.5 │ - │ 4.0 ┆ 0.5 │ - │ 5.0 ┆ 0.5 │ - │ 6.0 ┆ 0.5 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.1875 │ - │ 3.0 ┆ 0.1875 │ - │ 4.0 ┆ 0.1875 │ - │ 5.0 ┆ 0.1875 │ - │ 6.0 ┆ 0.1875 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), - ... 
) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 1.0 │ - │ 4.0 ┆ 1.0 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling var with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_var=pl.col("row_nr").rolling_var( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_var │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling var with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_var=pl.col("row_nr").rolling_var( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_var │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling median. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` means - the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. 
Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴────────────────┘ - - Specify weights for the values in each window: - - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴────────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ null │ - └─────┴────────────────┘ - - ''' - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling quantile. 
- - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - window_size - The length of the window. Can be a fixed integer size, or a dynamic - temporal size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, window_size=4 - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 2.0 │ - │ 5.0 ┆ 3.0 │ - │ 6.0 ┆ 4.0 │ - └─────┴──────────────────┘ - - Specify weights for the values in each window: - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] - ... ), - ... 
) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 2.0 │ - │ 5.0 ┆ 3.0 │ - │ 6.0 ┆ 4.0 │ - └─────┴──────────────────┘ - - Specify weights and interpolation method - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, - ... window_size=4, - ... weights=[0.2, 0.4, 0.4, 0.2], - ... interpolation="linear", - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 1.625 │ - │ 5.0 ┆ 2.625 │ - │ 6.0 ┆ 3.625 │ - └─────┴──────────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.2, window_size=5, center=True - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ null │ - │ 6.0 ┆ null │ - └─────┴──────────────────┘ - - ''' - def rolling_skew(self, window_size: int) -> Self: - ''' - Compute a rolling skew. - - The window at a given row includes the row itself and the - `window_size - 1` elements before it. - - Parameters - ---------- - window_size - Integer size of the rolling window. - bias - If False, the calculations are corrected for statistical bias. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) - >>> df.select(pl.col("a").rolling_skew(3)) - shape: (4, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ null │ - │ null │ - │ 0.381802 │ - │ 0.47033 │ - └──────────┘ - - Note how the values match the following: - - >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() - (0.38180177416060584, 0.47033046033698594) - - ''' - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a custom rolling window function. - - .. warning:: - Computing custom functions is extremely slow. Use specialized rolling - functions such as :func:`Expr.rolling_sum` if at all possible. - - Parameters - ---------- - function - Custom aggregation function. - window_size - Size of the window. The window at a given row will include the row - itself and the `window_size - 1` elements before it. - weights - A list of weights with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window. - - Examples - -------- - >>> from numpy import nansum - >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) - >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) - shape: (5, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ null │ - │ null │ - │ 22.0 │ - │ 11.0 │ - │ 17.0 │ - └──────┘ - - ''' - def abs(self) -> Self: - ''' - Compute absolute values. - - Same as `abs(expr)`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [-1.0, 0.0, 1.0, 2.0], - ... } - ... 
) - >>> df.select(pl.col("A").abs()) - shape: (4, 1) - ┌─────┐ - │ A │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 0.0 │ - │ 1.0 │ - │ 2.0 │ - └─────┘ - - ''' - def rank(self, method: RankMethod = ...) -> Self: - ''' - Assign ranks to data, dealing with ties appropriately. - - Parameters - ---------- - method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} - The method used to assign ranks to tied elements. - The following methods are available (default is \'average\'): - - - \'average\' : The average of the ranks that would have been assigned to - all the tied values is assigned to each value. - - \'min\' : The minimum of the ranks that would have been assigned to all - the tied values is assigned to each value. (This is also referred to - as "competition" ranking.) - - \'max\' : The maximum of the ranks that would have been assigned to all - the tied values is assigned to each value. - - \'dense\' : Like \'min\', but the rank of the next highest element is - assigned the rank immediately after those assigned to the tied - elements. - - \'ordinal\' : All values are given a distinct rank, corresponding to - the order that the values occur in the Series. - - \'random\' : Like \'ordinal\', but the rank for ties is not dependent - on the order that the values occur in the Series. - descending - Rank in descending order. - seed - If `method="random"`, use this as seed. - - Examples - -------- - The \'average\' method: - - >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) - >>> df.select(pl.col("a").rank()) - shape: (5, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 3.0 │ - │ 4.5 │ - │ 1.5 │ - │ 1.5 │ - │ 4.5 │ - └─────┘ - - The \'ordinal\' method: - - >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) - >>> df.select(pl.col("a").rank("ordinal")) - shape: (5, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 3 │ - │ 4 │ - │ 1 │ - │ 2 │ - │ 5 │ - └─────┘ - - Use \'rank\' with \'over\' to rank within groups: - - >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) - >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) - shape: (5, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ rank │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ f64 │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ 1.0 │ - │ 1 ┆ 7 ┆ 2.0 │ - │ 2 ┆ 5 ┆ 1.0 │ - │ 2 ┆ 14 ┆ 3.0 │ - │ 2 ┆ 11 ┆ 2.0 │ - └─────┴─────┴──────┘ - - ''' - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: - ''' - Calculate the first discrete difference between shifted items. - - Parameters - ---------- - n - Number of slots to shift. - null_behavior : {\'ignore\', \'drop\'} - How to handle null values. - - Examples - -------- - >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) - >>> df.with_columns(change=pl.col("int").diff()) - shape: (5, 2) - ┌─────┬────────┐ - │ int ┆ change │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪════════╡ - │ 20 ┆ null │ - │ 10 ┆ -10 │ - │ 30 ┆ 20 │ - │ 25 ┆ -5 │ - │ 35 ┆ 10 │ - └─────┴────────┘ - - >>> df.with_columns(change=pl.col("int").diff(n=2)) - shape: (5, 2) - ┌─────┬────────┐ - │ int ┆ change │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪════════╡ - │ 20 ┆ null │ - │ 10 ┆ null │ - │ 30 ┆ 10 │ - │ 25 ┆ 15 │ - │ 35 ┆ 5 │ - └─────┴────────┘ - - >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) - shape: (3, 1) - ┌──────┐ - │ diff │ - │ --- │ - │ i64 │ - ╞══════╡ - │ 10 │ - │ 15 │ - │ 5 │ - └──────┘ - - ''' - def pct_change(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Computes percentage change between values. 
- - Percentage change (as fraction) between current element and most-recent - non-null element at least `n` period(s) before the current element. - - Computes the change from the previous row by default. - - Parameters - ---------- - n - periods to shift for forming percent change. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [10, 11, 12, None, 12], - ... } - ... ) - >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) - shape: (5, 2) - ┌──────┬────────────┐ - │ a ┆ pct_change │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞══════╪════════════╡ - │ 10 ┆ null │ - │ 11 ┆ 0.1 │ - │ 12 ┆ 0.090909 │ - │ null ┆ 0.0 │ - │ 12 ┆ 0.0 │ - └──────┴────────────┘ - - ''' - def skew(self) -> Self: - ''' - Compute the sample skewness of a data set. - - For normally distributed data, the skewness should be about zero. For - unimodal continuous distributions, a skewness value greater than zero means - that there is more weight in the right tail of the distribution. The - function `skewtest` can be used to determine if the skewness value - is close enough to zero, statistically speaking. - - - See scipy.stats for more information. - - Parameters - ---------- - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Notes - ----- - The sample skewness is computed as the Fisher-Pearson coefficient - of skewness, i.e. - - .. math:: g_1=\\frac{m_3}{m_2^{3/2}} - - where - - .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i - - is the biased sample :math:`i\\texttt{th}` central moment, and - :math:`\\bar{x}` is - the sample mean. If `bias` is False, the calculations are - corrected for bias and the value computed is the adjusted - Fisher-Pearson standardized moment coefficient, i.e. - - .. math:: - G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").skew()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.343622 │ - └──────────┘ - - ''' - def kurtosis(self) -> Self: - ''' - Compute the kurtosis (Fisher or Pearson) of a dataset. - - Kurtosis is the fourth central moment divided by the square of the - variance. If Fisher\'s definition is used, then 3.0 is subtracted from - the result to give 0.0 for a normal distribution. - If bias is False then the kurtosis is calculated using k statistics to - eliminate bias coming from biased moment estimators. - - See scipy.stats for more information - - Parameters - ---------- - fisher : bool, optional - If True, Fisher\'s definition is used (normal ==> 0.0). If False, - Pearson\'s definition is used (normal ==> 3.0). - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").kurtosis()) - shape: (1, 1) - ┌───────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -1.153061 │ - └───────────┘ - - ''' - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: - ''' - Set values outside the given boundaries to the boundary value. - - Parameters - ---------- - lower_bound - Lower bound. Accepts expression input. - Non-expression inputs are parsed as literals. - upper_bound - Upper bound. Accepts expression input. - Non-expression inputs are parsed as literals. 
- - See Also - -------- - when - - Notes - ----- - This method only works for numeric and temporal columns. To clip other data - types, consider writing a `when-then-otherwise` expression. See :func:`when`. - - Examples - -------- - Specifying both a lower and upper bound: - - >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) - >>> df.with_columns(clip=pl.col("a").clip(1, 10)) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ clip │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ -50 ┆ 1 │ - │ 5 ┆ 5 │ - │ 50 ┆ 10 │ - │ null ┆ null │ - └──────┴──────┘ - - Specifying only a single bound: - - >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ clip │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ -50 ┆ -50 │ - │ 5 ┆ 5 │ - │ 50 ┆ 10 │ - │ null ┆ null │ - └──────┴──────┘ - - ''' - def lower_bound(self) -> Self: - ''' - Calculate the lower bound. - - Returns a unit Series with the lowest value possible for the dtype of this - expression. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").lower_bound()) - shape: (1, 1) - ┌──────────────────────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════════════════════╡ - │ -9223372036854775808 │ - └──────────────────────┘ - - ''' - def upper_bound(self) -> Self: - ''' - Calculate the upper bound. - - Returns a unit Series with the highest value possible for the dtype of this - expression. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").upper_bound()) - shape: (1, 1) - ┌─────────────────────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════════════════════╡ - │ 9223372036854775807 │ - └─────────────────────┘ - - ''' - def sign(self) -> Self: - ''' - Compute the element-wise indication of the sign. - - The returned values can be -1, 0, or 1: - - * -1 if x < 0. - * 0 if x == 0. - * 1 if x > 0. - - (null values are preserved as-is). - - Examples - -------- - >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) - >>> df.select(pl.col("a").sign()) - shape: (5, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ -1 │ - │ 0 │ - │ 0 │ - │ 1 │ - │ null │ - └──────┘ - - ''' - def sin(self) -> Self: - ''' - Compute the element-wise value for the sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").sin()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def cos(self) -> Self: - ''' - Compute the element-wise value for the cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").cos()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def tan(self) -> Self: - ''' - Compute the element-wise value for the tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").tan().round(2)) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 1.56 │ - └──────┘ - - ''' - def cot(self) -> Self: - ''' - Compute the element-wise value for the cotangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").cot().round(2)) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 0.64 │ - └──────┘ - - ''' - def arcsin(self) -> Self: - ''' - Compute the element-wise value for the inverse sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arcsin()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.570796 │ - └──────────┘ - - ''' - def arccos(self) -> Self: - ''' - Compute the element-wise value for the inverse cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").arccos()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.570796 │ - └──────────┘ - - ''' - def arctan(self) -> Self: - ''' - Compute the element-wise value for the inverse tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arctan()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.785398 │ - └──────────┘ - - ''' - def sinh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").sinh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.175201 │ - └──────────┘ - - ''' - def cosh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").cosh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.543081 │ - └──────────┘ - - ''' - def tanh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").tanh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.761594 │ - └──────────┘ - - ''' - def arcsinh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arcsinh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.881374 │ - └──────────┘ - - ''' - def arccosh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arccosh()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def arctanh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arctanh()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ inf │ - └─────┘ - - ''' - def degrees(self) -> Self: - ''' - Convert from radians to degrees. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> import math - >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) - >>> df.select(pl.col("a").degrees()) - shape: (9, 1) - ┌────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞════════╡ - │ -720.0 │ - │ -540.0 │ - │ -360.0 │ - │ -180.0 │ - │ 0.0 │ - │ 180.0 │ - │ 360.0 │ - │ 540.0 │ - │ 720.0 │ - └────────┘ - ''' - def radians(self) -> Self: - ''' - Convert from degrees to radians. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) - >>> df.select(pl.col("a").radians()) - shape: (9, 1) - ┌────────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞════════════╡ - │ -12.566371 │ - │ -9.424778 │ - │ -6.283185 │ - │ -3.141593 │ - │ 0.0 │ - │ 3.141593 │ - │ 6.283185 │ - │ 9.424778 │ - │ 12.566371 │ - └────────────┘ - ''' - def reshape(self, dimensions: tuple[int, ...]) -> Self: - ''' - Reshape this Expr to a flat Series or a Series of Lists. - - Parameters - ---------- - dimensions - Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that - dimension is inferred. - - Returns - ------- - Expr - If a single dimension is given, results in an expression of the original - data type. - If a multiple dimensions are given, results in an expression of data type - :class:`List` with shape (rows, cols). - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) - >>> df.select(pl.col("foo").reshape((3, 3))) - shape: (3, 1) - ┌───────────┐ - │ foo │ - │ --- │ - │ list[i64] │ - ╞═══════════╡ - │ [1, 2, 3] │ - │ [4, 5, 6] │ - │ [7, 8, 9] │ - └───────────┘ - - See Also - -------- - Expr.list.explode : Explode a list column. - - ''' - def shuffle(self, seed: int | None = ...) -> Self: - ''' - Shuffle the contents of this expression. - - Parameters - ---------- - seed - Seed for the random number generator. If set to None (default), a - random seed is generated each time the shuffle is called. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").shuffle(seed=1)) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - │ 1 │ - │ 3 │ - └─────┘ - - ''' - def sample(self, n: int | IntoExprColumn | None = ...) -> Self: - ''' - Sample from this expression. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - Shuffle the order of sampled data points. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 3 │ - │ 1 │ - │ 1 │ - └─────┘ - - ''' - def ewm_mean(self) -> Self: - ''' - Exponentially-weighted moving average. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. 
math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_mean(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.666667 │ - │ 2.428571 │ - └──────────┘ - - ''' - def ewm_std(self) -> Self: - ''' - Exponentially-weighted moving standard deviation. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. 
- For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_std(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 0.707107 │ - │ 0.963624 │ - └──────────┘ - - ''' - def ewm_var(self) -> Self: - ''' - Exponentially-weighted moving variance. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_var(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 0.5 │ - │ 0.928571 │ - └──────────┘ - - ''' - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: - ''' - Extremely fast method for extending the Series with \'n\' copies of a value. - - Parameters - ---------- - value - A constant literal value (not an expression) with which to extend the - expression result Series; can pass None to extend with nulls. - n - The number of additional values that will be added. 
- - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3]}) - >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) - shape: (5, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 0 │ - │ 1 │ - │ 2 │ - │ 99 │ - │ 99 │ - └────────┘ - - ''' - def value_counts(self) -> Self: - ''' - Count the occurrences of unique values. - - Parameters - ---------- - sort - Sort the output by count in descending order. - If set to `False` (default), the order of the output is random. - parallel - Execute the computation in parallel. - - .. note:: - This option should likely not be enabled in a group by context, - as the computation is already parallelized per group. - - Returns - ------- - Expr - Expression of data type :class:`Struct` with mapping of unique values to - their count. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} - ... ) - >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT - shape: (3, 1) - ┌─────────────┐ - │ color │ - │ --- │ - │ struct[2] │ - ╞═════════════╡ - │ {"red",2} │ - │ {"green",1} │ - │ {"blue",3} │ - └─────────────┘ - - Sort the output by count. - - >>> df.select(pl.col("color").value_counts(sort=True)) - shape: (3, 1) - ┌─────────────┐ - │ color │ - │ --- │ - │ struct[2] │ - ╞═════════════╡ - │ {"blue",3} │ - │ {"red",2} │ - │ {"green",1} │ - └─────────────┘ - - ''' - def unique_counts(self) -> Self: - ''' - Return a count of the unique values in the order of appearance. - - This method differs from `value_counts` in that it does not return the - values, only the counts and might be faster - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "id": ["a", "b", "b", "c", "c", "c"], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("id").unique_counts(), - ... ] - ... ) - shape: (3, 1) - ┌─────┐ - │ id │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def log(self, base: float = ...) -> Self: - ''' - Compute the logarithm to a given base. - - Parameters - ---------- - base - Given base, defaults to `e` - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").log(base=2)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 1.0 │ - │ 1.584963 │ - └──────────┘ - - ''' - def log1p(self) -> Self: - ''' - Compute the natural logarithm of each element plus one. - - This computes `log(1 + x)` but is more numerically stable for `x` close to zero. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").log1p()) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.693147 │ - │ 1.098612 │ - │ 1.386294 │ - └──────────┘ - - ''' - def entropy(self, base: float = ...) -> Self: - ''' - Computes the entropy. - - Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. - - Parameters - ---------- - base - Given base, defaults to `e` - normalize - Normalize pk if it doesn\'t sum to 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").entropy(base=2)) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.459148 │ - └──────────┘ - >>> df.select(pl.col("a").entropy(base=2, normalize=False)) - shape: (1, 1) - ┌───────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -6.754888 │ - └───────────┘ - - ''' - def cumulative_eval(self, expr: Expr, min_periods: int = ...) 
-> Self: - ''' - Run an expression over a sliding window that increases `1` slot every iteration. - - Parameters - ---------- - expr - Expression to evaluate - min_periods - Number of valid values there should be in the window before the expression - is evaluated. valid values = `length - null_count` - parallel - Run in parallel. Don\'t do this in a group by or another operation that - already has much parallelization. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - This can be really slow as it can have `O(n^2)` complexity. Don\'t use this - for operations that visit all elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) - >>> df.select( - ... [ - ... pl.col("values").cumulative_eval( - ... pl.element().first() - pl.element().last() ** 2 - ... ) - ... ] - ... ) - shape: (5, 1) - ┌────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞════════╡ - │ 0.0 │ - │ -3.0 │ - │ -8.0 │ - │ -15.0 │ - │ -24.0 │ - └────────┘ - - ''' - def set_sorted(self) -> Self: - ''' - Flags the expression as \'sorted\'. - - Enables downstream code to user fast paths for sorted arrays. - - Parameters - ---------- - descending - Whether the `Series` order is descending. - - Warnings - -------- - This can lead to incorrect results if this `Series` is not sorted!! - Use with care! - - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3]}) - >>> df.select(pl.col("values").set_sorted().max()) - shape: (1, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 3 │ - └────────┘ - - ''' - def shrink_dtype(self) -> Self: - ''' - Shrink numeric columns to the minimal required datatype. - - Shrink to the dtype needed to fit the extrema of this [`Series`]. - This can be used to reduce memory pressure. - - Examples - -------- - >>> pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [1, 2, 2 << 32], - ... "c": [-1, 2, 1 << 30], - ... "d": [-112, 2, 112], - ... "e": [-112, 2, 129], - ... "f": ["a", "b", "c"], - ... "g": [0.1, 1.32, 0.12], - ... "h": [True, None, False], - ... } - ... ).select(pl.all().shrink_dtype()) - shape: (3, 8) - ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ - ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ - │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ - │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ - │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ - └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ - - ''' - def cache(self) -> Self: - """ - Cache this expression so that it only is executed once per context. - - .. deprecated:: 0.18.9 - This method now does nothing. It has been superseded by the - `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically - caches expressions that are equal. - - """ - def replace(self, mapping: dict[Any, Any]) -> Self: - ''' - Replace values according to the given mapping. - - Needs a global string cache for lazily evaluated queries on columns of - type `Categorical`. - - Parameters - ---------- - mapping - Mapping of values to their replacement. - default - Value to use when the mapping does not contain the lookup value. - Defaults to keeping the original value. Accepts expression input. - Non-expression inputs are parsed as literals. - return_dtype - Set return dtype to override automatic return dtype determination. 
- - See Also - -------- - str.replace - - Examples - -------- - Replace a single value by another value. Values not in the mapping remain - unchanged. - - >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) - >>> df.with_columns(pl.col("a").replace({2: 100}).alias("replaced")) - shape: (4, 2) - ┌─────┬──────────┐ - │ a ┆ replaced │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════════╡ - │ 1 ┆ 1 │ - │ 2 ┆ 100 │ - │ 2 ┆ 100 │ - │ 3 ┆ 3 │ - └─────┴──────────┘ - - Replace multiple values. Specify a default to set values not in the given map - to the default value. - - >>> df = pl.DataFrame({"country_code": ["FR", "ES", "DE", None]}) - >>> country_code_map = { - ... "CA": "Canada", - ... "DE": "Germany", - ... "FR": "France", - ... None: "unspecified", - ... } - >>> df.with_columns( - ... pl.col("country_code") - ... .replace(country_code_map, default=None) - ... .alias("replaced") - ... ) - shape: (4, 2) - ┌──────────────┬─────────────┐ - │ country_code ┆ replaced │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞══════════════╪═════════════╡ - │ FR ┆ France │ - │ ES ┆ null │ - │ DE ┆ Germany │ - │ null ┆ unspecified │ - └──────────────┴─────────────┘ - - The return type can be overridden with the `return_dtype` argument. - - >>> df = df.with_row_count() - >>> df.select( - ... "row_nr", - ... pl.col("row_nr") - ... .replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) - ... .alias("replaced"), - ... ) - shape: (4, 2) - ┌────────┬──────────┐ - │ row_nr ┆ replaced │ - │ --- ┆ --- │ - │ u32 ┆ u8 │ - ╞════════╪══════════╡ - │ 0 ┆ 0 │ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 0 │ - └────────┴──────────┘ - - To reference other columns as a `default` value, a struct column must be - constructed first. The first field must be the column in which values are - replaced. The other columns can be used in the default expression. - - >>> df.with_columns( - ... pl.struct("country_code", "row_nr") - ... .replace( - ... mapping=country_code_map, - ... default=pl.col("row_nr").cast(pl.Utf8), - ... ) - ... .alias("replaced") - ... ) - shape: (4, 3) - ┌────────┬──────────────┬─────────────┐ - │ row_nr ┆ country_code ┆ replaced │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ str ┆ str │ - ╞════════╪══════════════╪═════════════╡ - │ 0 ┆ FR ┆ France │ - │ 1 ┆ ES ┆ 1 │ - │ 2 ┆ DE ┆ Germany │ - │ 3 ┆ null ┆ unspecified │ - └────────┴──────────────┴─────────────┘ - ''' - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom python function to a Series or sequence of Series. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.map_batches`. - - Parameters - ---------- - function - Lambda/ function to apply. - return_dtype - Dtype of the output Series. - agg_list - Aggregate list - - """ - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.map_elements`. - - Parameters - ---------- - function - Lambda/ function to apply. - return_dtype - Dtype of the output Series. - If not set, the dtype will be - `polars.Unknown`. - skip_nulls - Don't apply the function over values - that contain nulls. This is faster. - pass_name - Pass the Series name to the custom function - This is more expensive. - strategy : {'thread_local', 'threading'} - This functionality is in `alpha` stage. 
This may be removed - /changed without it being considered a breaking change. - - - 'thread_local': run the python function on a single thread. - - 'threading': run the python function on separate threads. Use with - care as this can slow performance. This might only speed up - your code if the amount of work per element is significant - and the python function releases the GIL (e.g. via calling - a c function) - - """ - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - """ - Apply a custom rolling window function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.rolling_map`. - - Parameters - ---------- - function - Aggregation function - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - """ - def is_first(self) -> Self: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Expr.is_first_distinct`. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - """ - def is_last(self) -> Self: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Expr.is_last_distinct`. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - """ - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: - """ - Clip (limit) the values in an array to a `min` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - lower_bound - Lower bound. - - """ - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: - """ - Clip (limit) the values in an array to a `max` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - upper_bound - Upper bound. - - """ - def shift_and_fill(self, fill_value: IntoExpr) -> Self: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - Fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def register_plugin(self) -> Self: - """ - Register a shared library as a plugin. - - .. warning:: - This is highly unsafe as this will call the C function - loaded by `lib::symbol`. - - The parameters you give dictate how polars will deal - with the function. Make sure they are correct! - - .. note:: - This functionality is unstable and may change without it - being considered breaking. - - Parameters - ---------- - lib - Library to load. - symbol - Function to load. - args - Arguments (other than self) passed to this function. - These arguments have to be of type Expression. - kwargs - Non-expression arguments. They must be JSON serializable. - is_elementwise - If the function only operates on scalars - this will trigger fast paths. - input_wildcard_expansion - Expand expressions as input of this function. 
- returns_scalar - Automatically explode on unit length if it ran as final aggregation. - this is the case for aggregations like `sum`, `min`, `covariance` etc. - cast_to_supertypes - Cast the input datatypes to their supertype. - pass_name_to_apply - if set, then the `Series` passed to the function in the group_by operation - will ensure the name is set. This is an extra heap allocation per group. - changes_length - For example a `unique` or a `slice` - - """ - def _register_plugin(self) -> Self: ... - def take_every(self, n: int) -> Self: - """ - Take every nth value in the Series and return as a new Series. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: - """ - Take values by index. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather`. - - Parameters - ---------- - indices - An expression that leads to a UInt32 dtyped Series. - """ - def cumsum(self) -> Self: - """ - Get an array with the cumulative sum computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_sum`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cumprod(self) -> Self: - """ - Get an array with the cumulative product computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_prod`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cummin(self) -> Self: - """ - Get an array with the cumulative min computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_min`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cummax(self) -> Self: - """ - Get an array with the cumulative max computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_max`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cumcount(self) -> Self: - """ - Get an array with the cumulative count computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_count`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def map_dict(self, mapping: dict[Any, Any]) -> Self: - """ - Replace values in column according to remapping dictionary. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`replace`. The default behavior - has changed to keep any values not present in the mapping unchanged. - Pass `default=None` to keep existing behavior. - - Parameters - ---------- - mapping - Dictionary containing the before/after values to map. - default - Value to use when the remapping dict does not contain the lookup value. - Accepts expression input. Non-expression inputs are parsed as literals. - Use `pl.first()`, to keep the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - - """ - @property - def bin(self): ... - @property - def cat(self): ... - @property - def dt(self): ... - @property - def list(self): ... - @property - def arr(self): ... - @property - def meta(self): ... - @property - def name(self): ... - @property - def str(self): ... - @property - def struct(self): ... -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: - """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/expr/expr.pyi similarity index 99% rename from polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/expr/expr rename to polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/expr/expr.pyi index 5131d44..dc9ff79 100644 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/expr/expr +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/expr/expr.pyi @@ -1,3 +1,4 @@ +#: version 0.19.18 import P import np as np import pl diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/lazyframe/frame deleted file mode 100644 index 561f5b2..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/lazyframe/frame +++ /dev/null @@ -1,4211 +0,0 @@ -import P -import np -import pa -from builtins import PyLazyFrame -from pathlib import Path -from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 -from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat -from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy -from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath -from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence - -TYPE_CHECKING: bool -DTYPE_TEMPORAL_UNITS: frozenset 
-N_INFER_DEFAULT: int - -class LazyFrame: - _accessors: _ClassVar[set] = ... - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - @classmethod - def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: - """ - Lazily read from a CSV file or multiple files via glob patterns. - - Use `pl.scan_csv` to dispatch to this method. - - See Also - -------- - polars.io.scan_csv - - """ - @classmethod - def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: - """ - Lazily read from a parquet file or multiple files via glob patterns. - - Use `pl.scan_parquet` to dispatch to this method. - - See Also - -------- - polars.io.scan_parquet - - """ - @classmethod - def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: - """ - Lazily read from an Arrow IPC (Feather v2) file. - - Use `pl.scan_ipc` to dispatch to this method. - - See Also - -------- - polars.io.scan_ipc - - """ - @classmethod - def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: - """ - Lazily read from a newline delimited JSON file. - - Use `pl.scan_ndjson` to dispatch to this method. - - See Also - -------- - polars.io.scan_ndjson - - """ - @classmethod - def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: - """ - Read a logical plan from a JSON string to construct a LazyFrame. - - .. deprecated:: 0.18.12 - This method is deprecated. Convert the JSON string to `StringIO` - and then use `LazyFrame.deserialize`. - - Parameters - ---------- - json - String in JSON format. - - See Also - -------- - deserialize - - """ - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: - """ - Read a logical plan from a JSON file to construct a LazyFrame. - - .. deprecated:: 0.18.12 - This class method has been renamed to `deserialize`. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - - See Also - -------- - deserialize - - """ - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: - ''' - Read a logical plan from a JSON file to construct a LazyFrame. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - - See Also - -------- - LazyFrame.serialize - - Examples - -------- - >>> import io - >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() - >>> json = lf.serialize() - >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def __dataframe_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. - """ - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... 
- def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def _repr_html_(self) -> str: ... - def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: - ''' - Serialize the logical plan of this LazyFrame to a file or string in JSON format. - - Parameters - ---------- - file - File path to which the result should be written. If set to `None` - (default), the output is returned as a string instead. - - See Also - -------- - LazyFrame.deserialize - - Examples - -------- - Serialize the logical plan into a JSON string. - - >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() - >>> json = lf.serialize() - >>> json - \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' - - The logical plan can later be deserialized back into a LazyFrame. - - >>> import io - >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: - """ - Serialize the logical plan of this LazyFrame to a file or string in JSON format. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`LazyFrame.serialize`. - - Parameters - ---------- - file - File path to which the result should be written. If set to `None` - (default), the output is returned as a string instead. - """ - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the frame as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Examples - -------- - >>> def cast_str_to_int(data, col_name): - ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) - ... - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": ["10", "20", "30", "40"], - ... } - ... ) - >>> lf.pipe(cast_str_to_int, col_name="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 30 │ - │ 4 ┆ 40 │ - └─────┴─────┘ - - >>> lf = pl.LazyFrame( - ... { - ... "b": [1, 2], - ... "a": [3, 4], - ... } - ... ) - >>> lf.collect() - shape: (2, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - └─────┴─────┘ - >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 1 │ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def explain(self) -> str: - ''' - Create a string representation of the query plan. - - Different optimizations can be turned on or off. - - Parameters - ---------- - optimized - Return an optimized query plan. Defaults to `True`. - If this is set to `True` the subsequent - optimization flags control which optimizations - run. 
- type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).explain() # doctest: +SKIP - ''' - def show_graph(self) -> str | None: - ''' - Show a plot of the query plan. Note that you should have graphviz installed. - - Parameters - ---------- - optimized - Optimize the query plan. - show - Show the figure. - output_path - Write the figure to disk. - raw_output - Return dot syntax. This cannot be combined with `show` and/or `output_path`. - figsize - Passed to matplotlib if `show` == True. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).show_graph() # doctest: +SKIP - - ''' - def inspect(self, fmt: str = ...) -> Self: - ''' - Inspect a node in the computation graph. - - Print the value that this node in the computation graph evaluates to and passes - on the value. - - Examples - -------- - >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) - >>> ( - ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) - ... .inspect() # print the node before the filter - ... .filter(pl.col("bar") == pl.col("foo")) - ... ) # doctest: +ELLIPSIS - - - ''' - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: - ''' - Sort the DataFrame by the given columns. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. - *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, None], - ... "b": [6.0, 5.0, 4.0], - ... "c": ["a", "c", "b"], - ... } - ... 
) - >>> lf.sort("a").collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - Sorting by expressions is also supported. - - >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - └──────┴─────┴─────┘ - - Sort by multiple columns by passing a list of columns. - - >>> lf.sort(["c", "a"], descending=True).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - └──────┴─────┴─────┘ - - Or use positional arguments to sort by multiple columns in the same way. - - >>> lf.sort("c", "a", descending=[False, True]).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - ''' - def top_k(self, k: int) -> Self: - ''' - Return the `k` largest elements. - - If \'descending=True` the smallest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might - be worse since this requires a stable search. - - See Also - -------- - bottom_k - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 largest values in column b. - - >>> lf.top_k(4, by="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ a ┆ 2 │ - │ b ┆ 2 │ - │ b ┆ 1 │ - └─────┴─────┘ - - Get the rows which contain the 4 largest values when sorting on column b and a. - - >>> lf.top_k(4, by=["b", "a"]).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 2 │ - │ c ┆ 1 │ - └─────┴─────┘ - - ''' - def bottom_k(self, k: int) -> Self: - ''' - Return the `k` smallest elements. - - If \'descending=True` the largest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - top_k - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 smallest values in column b. 
- - >>> lf.bottom_k(4, by="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 1 │ - │ a ┆ 1 │ - │ c ┆ 1 │ - │ a ┆ 2 │ - └─────┴─────┘ - - Get the rows which contain the 4 smallest values when sorting on column a and b. - - >>> lf.bottom_k(4, by=["a", "b"]).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ b ┆ 1 │ - │ b ┆ 2 │ - └─────┴─────┘ - - ''' - def profile(self) -> tuple[DataFrame, DataFrame]: - ''' - Profile a LazyFrame. - - This will run the query and return a tuple - containing the materialized DataFrame and a DataFrame that - contains profiling information of each node that is executed. - - The units of the timings are microseconds. - - Parameters - ---------- - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - show_plot - Show a gantt chart of the profiling result - truncate_nodes - Truncate the label lengths in the gantt chart to this number of - characters. - figsize - matplotlib figsize of the profiling plot - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).profile() # doctest: +SKIP - (shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘, - shape: (3, 3) - ┌─────────────────────────┬───────┬──────┐ - │ node ┆ start ┆ end │ - │ --- ┆ --- ┆ --- │ - │ str ┆ u64 ┆ u64 │ - ╞═════════════════════════╪═══════╪══════╡ - │ optimization ┆ 0 ┆ 5 │ - │ group_by_partitioned(a) ┆ 5 ┆ 470 │ - │ sort(a) ┆ 475 ┆ 1964 │ - └─────────────────────────┴───────┴──────┘) - - ''' - def collect(self) -> DataFrame: - ''' - Materialize this LazyFrame into a DataFrame. - - By default, all query optimizations are enabled. Individual optimizations may - be disabled by setting the corresponding parameter to `False`. - - Parameters - ---------- - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - no_optimization - Turn off (certain) optimizations. - streaming - Process the query in batches to handle larger-than-memory data. - If set to `False` (default), the entire query is processed in a single - batch. - - .. warning:: - This functionality is currently in an alpha state. - - .. note:: - Use :func:`explain` to see if Polars can process the query in streaming - mode. 
- - Returns - ------- - DataFrame - - See Also - -------- - fetch: Run the query on the first `n` rows only for debugging purposes. - explain : Print the query plan that is evaluated with collect. - profile : Collect the LazyFrame and time each node in the computation graph. - polars.collect_all : Collect multiple LazyFrames at the same time. - polars.Config.set_streaming_chunk_size : Set the size of streaming batches. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - - Collect in streaming mode - - >>> lf.group_by("a").agg(pl.all().sum()).collect( - ... streaming=True - ... ) # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: - ''' - Collect DataFrame asynchronously in thread pool. - - Collects into a DataFrame (like :func:`collect`), but instead of returning - DataFrame directly, they are scheduled to be collected inside thread pool, - while this method returns almost instantly. - - May be useful if you use gevent or asyncio and want to release control to other - greenlets/tasks while LazyFrames are being collected. - - Parameters - ---------- - gevent - Return wrapper to `gevent.event.AsyncResult` instead of Awaitable - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Notes - ----- - In case of error `set_exception` is used on - `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - See Also - -------- - polars.collect_all : Collect multiple LazyFrames at the same time. - polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. - - Returns - ------- - If `gevent=False` (default) then returns awaitable. - - If `gevent=True` then returns wrapper that has - `.get(block=True, timeout=None)` method. - - Examples - -------- - >>> import asyncio - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> async def main(): - ... return await ( - ... lf.group_by("a", maintain_order=True) - ... .agg(pl.all().sum()) - ... .collect_async() - ... ) - ... 
- >>> asyncio.run(main()) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - ''' - def sink_parquet(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to a Parquet file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. - Choose "snappy" for more backwards compatibility guarantees - when you deal with older parquet readers. - compression_level - The level of compression to use. Higher compression means smaller files on - disk. - - - "gzip" : min-level: 0, max-level: 10. - - "brotli" : min-level: 0, max-level: 11. - - "zstd" : min-level: 1, max-level: 22. - statistics - Write statistics to the parquet headers. This requires extra compute. - row_group_size - Size of the row groups in number of rows. - If None (default), the chunks of the `DataFrame` are - used. Writing in smaller chunks may reduce memory pressure and improve - writing speeds. - data_pagesize_limit - Size limit of individual data pages. - If not set defaults to 1024 * 1024 bytes - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_parquet("out.parquet") # doctest: +SKIP - - ''' - def sink_ipc(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to an IPC file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - compression : {\'lz4\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_ipc("out.arrow") # doctest: +SKIP - - ''' - def sink_csv(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to a CSV file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - include_bom - Whether to include UTF-8 BOM in the CSV output. 
- include_header - Whether to include header in the CSV output. - separator - Separate CSV fields with this symbol. - line_terminator - String used to end each row. - quote_char - Byte to use as quoting character. - batch_size - Number of rows that will be processed per thread. - datetime_format - A format string, with the specifiers defined by the - `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_ - Rust crate. If no format specified, the default fractional-second - precision is inferred from the maximum timeunit found in the frame\'s - Datetime cols (if any). - date_format - A format string, with the specifiers defined by the - `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_ - Rust crate. - time_format - A format string, with the specifiers defined by the - `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_ - Rust crate. - float_precision - Number of decimal places to write, applied to both `Float32` and - `Float64` datatypes. - null_value - A string representing null values (defaulting to the empty string). - quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} - Determines the quoting strategy used. - - - necessary (default): This puts quotes around fields only when necessary. - They are necessary when fields contain a quote, - delimiter or record terminator. - Quotes are also necessary when writing an empty record - (which is indistinguishable from a record with one empty field). - This is the default. - - always: This puts quotes around every field. Always. - - never: This never puts quotes around fields, even if that results in - invalid CSV data (e.g.: by not quoting strings containing the - separator). - - non_numeric: This puts quotes around all fields that are non-numeric. - Namely, when writing a field that does not parse as a valid float - or integer, then quotes will be used even if they aren`t strictly - necessary. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_csv("out.csv") # doctest: +SKIP - - ''' - def sink_ndjson(self, path: str | Path) -> DataFrame: - ''' - Persists a LazyFrame at the provided path. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_json("out.json") # doctest: +SKIP - - ''' - def _set_sink_optimizations(self) -> PyLazyFrame: ... - def fetch(self, n_rows: int = ...) -> DataFrame: - ''' - Collect a small number of rows for debugging purposes. 
- - Parameters - ---------- - n_rows - Collect n_rows from the data sources. - type_coercion - Run type coercion optimization. - predicate_pushdown - Run predicate pushdown optimization. - projection_pushdown - Run projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off optimizations. - slice_pushdown - Slice pushdown optimization - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Notes - ----- - This is similar to a :func:`collect` operation, but it overwrites the number of - rows read by *every* scan operation. Be aware that `fetch` does not guarantee - the final number of rows in the DataFrame. Filters, join operations and fewer - rows being available in the scanned data will all influence the final number - of rows (joins are especially susceptible to this, and may return no data - at all if `n_rows` is too small as the join keys may not be present). - - Warnings - -------- - This is strictly a utility function that can help to debug queries using a - smaller number of rows, and should *not* be used in production code. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 6 │ - │ b ┆ 2 ┆ 5 │ - └─────┴─────┴─────┘ - - ''' - def lazy(self) -> Self: - ''' - Return lazy representation, i.e. itself. - - Useful for writing code that expects either a :class:`DataFrame` or - :class:`LazyFrame`. - - Returns - ------- - LazyFrame - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.lazy() # doctest: +ELLIPSIS - - - ''' - def cache(self) -> Self: - """Cache the result once the execution of the physical plan hits this node.""" - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: - ''' - Cast LazyFrame column(s) to the specified dtype(s). - - Parameters - ---------- - dtypes - Mapping of column names (or selector) to dtypes, or a single dtype - to which all columns will be cast. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> from datetime import date - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], - ... } - ... 
) - - Cast specific frame columns to the specified dtypes: - - >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ u8 ┆ date │ - ╞═════╪═════╪════════════╡ - │ 1.0 ┆ 6 ┆ 2020-01-02 │ - │ 2.0 ┆ 7 ┆ 2021-03-04 │ - │ 3.0 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - Cast all frame columns to the specified dtype: - - >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) - {\'foo\': [\'1\', \'2\', \'3\'], - \'bar\': [\'6.0\', \'7.0\', \'8.0\'], - \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} - - Use selectors to define the columns being cast: - - >>> import polars.selectors as cs - >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ str │ - ╞═════╪═════╪════════════╡ - │ 1 ┆ 6 ┆ 2020-01-02 │ - │ 2 ┆ 7 ┆ 2021-03-04 │ - │ 3 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - ''' - def clear(self, n: int = ...) -> LazyFrame: - ''' - Create an empty copy of the current LazyFrame, with zero to \'n\' rows. - - Returns a copy with an identical schema but no data. - - Parameters - ---------- - n - Number of (empty) rows to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.clear().fetch() - shape: (0, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪═════╪══════╡ - └─────┴─────┴──────┘ - - >>> lf.clear(2).fetch() - shape: (2, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪══════╪══════╡ - │ null ┆ null ┆ null │ - │ null ┆ null ┆ null │ - └──────┴──────┴──────┘ - - ''' - def clone(self) -> Self: - ''' - Create a copy of this LazyFrame. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current LazyFrame, with identical - schema but no data. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.clone() # doctest: +ELLIPSIS - - - ''' - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: - ''' - Filter the rows in the LazyFrame based on a predicate expression. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - predicates - Expression that evaluates to a boolean Series. - constraints - Column filters. Use name=value to filter column name by the supplied value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - - Filter on one condition: - - >>> lf.filter(pl.col("foo") > 1).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Filter on multiple conditions: - - >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `*args` syntax: - - >>> lf.filter( - ... pl.col("foo") == 1, - ... pl.col("ham") == "a", - ... ).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `**kwargs` syntax: - - >>> lf.filter(foo=1, ham="a").collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Filter on an OR condition: - - >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - ''' - Select columns from this LazyFrame. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Examples - -------- - Pass the name of a column to select that column. - - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.select("foo").collect() - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Multiple columns can be selected by passing a list of column names. - - >>> lf.select(["foo", "bar"]).collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - └─────┴─────┘ - - Multiple columns can also be selected using positional arguments instead of a - list. Expressions are also accepted. - - >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - └─────┴─────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> lf.select( - ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) - ... ).collect() - shape: (3, 1) - ┌───────────┐ - │ threshold │ - │ --- │ - │ i32 │ - ╞═══════════╡ - │ 0 │ - │ 0 │ - │ 10 │ - └───────────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... lf.select( - ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), - ... ).collect() - ... 
- shape: (3, 1) - ┌───────────┐ - │ is_odd │ - │ --- │ - │ struct[2] │ - ╞═══════════╡ - │ {1,0} │ - │ {0,1} │ - │ {1,0} │ - └───────────┘ - - ''' - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - """ - Select columns from this LazyFrame. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - See Also - -------- - select - - """ - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: - ''' - Start a group by operation. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Setting this to `True` blocks the possibility - to run on the streaming engine. - - Examples - -------- - Group by one column and call `agg` to compute the grouped sum of another - column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 2 │ - │ b ┆ 5 │ - │ c ┆ 3 │ - └─────┴─────┘ - - Set `maintain_order=True` to ensure the order of the groups is consistent with - the input. - - >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() - shape: (3, 2) - ┌─────┬───────────┐ - │ a ┆ c │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [5, 3] │ - │ b ┆ [4, 2] │ - │ c ┆ [1] │ - └─────┴───────────┘ - - Group by multiple columns by passing a list of column names. - - >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - Or use positional arguments to group by multiple columns in the same way. - Expressions are also accepted. - - >>> lf.group_by("a", pl.col("b") // 2).agg( - ... pl.col("c").mean() - ... ).collect() # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ f64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 0 ┆ 4.0 │ - │ b ┆ 1 ┆ 3.0 │ - │ c ┆ 1 ┆ 1.0 │ - └─────┴─────┴─────┘ - - ''' - def rolling(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - Different from a `dynamic_group_by` the windows are now determined by the - individual values and are not of constant intervals. For constant intervals - use :func:`LazyFrame.group_by_dynamic`. - - If you have a time series ``<t_0, t_1, ..., t_n>``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... 
- * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... - * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - group_by_dynamic - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> out = ( - ... df.rolling(index_column="dt", period="2d") - ... .agg( - ... pl.sum("a").alias("sum_a"), - ... pl.min("a").alias("min_a"), - ... pl.max("a").alias("max_a"), - ... ) - ... .collect() - ... 
) - >>> out - shape: (6, 4) - ┌─────────────────────┬───────┬───────┬───────┐ - │ dt ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴───────┴───────┴───────┘ - - ''' - def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - Time windows are calculated and rows are assigned to windows. Different from a - normal group by is that a row can be member of multiple groups. - By default, the windows look like: - - - [start, start + period) - - [start + every, start + every + period) - - [start + 2*every, start + 2*every + period) - - ... - - where `start` is determined by `start_by`, `offset`, and `every` (see parameter - descriptions below). - - .. warning:: - The index column must be sorted in ascending order. If `by` is passed, then - the index column must be sorted in ascending order within each group. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - - .. deprecated:: 0.19.4 - Use `label` instead. - include_boundaries - Add the lower and upper bound of the window to the "_lower_boundary" and - "_upper_boundary" columns. This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - label : {\'left\', \'right\', \'datapoint\'} - Define which label to use for the window: - - - \'left\': lower boundary of the window - - \'right\': upper boundary of the window - - \'datapoint\': the first value of the index column in the given window. - If you don\'t need the label to be at one of the boundaries, choose this - option for maximum performance - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. 
- check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - rolling - - Notes - ----- - 1) If you\'re coming from pandas, then - - .. code-block:: python - - # polars - df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) - - is equivalent to - - .. code-block:: python - - # pandas - df.set_index("ts").resample("D")["value"].sum().reset_index() - - though note that, unlike pandas, polars doesn\'t add extra rows for empty - windows. If you need `index_column` to be evenly spaced, then please combine - with :func:`DataFrame.upsample`. - - 2) The `every`, `period` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a group_by_dynamic on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Examples - -------- - >>> from datetime import datetime - >>> lf = pl.LazyFrame( - ... { - ... "time": pl.datetime_range( - ... start=datetime(2021, 12, 16), - ... end=datetime(2021, 12, 16, 3), - ... interval="30m", - ... eager=True, - ... ), - ... "n": range(7), - ... } - ... ) - >>> lf.collect() - shape: (7, 2) - ┌─────────────────────┬─────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i64 │ - ╞═════════════════════╪═════╡ - │ 2021-12-16 00:00:00 ┆ 0 │ - │ 2021-12-16 00:30:00 ┆ 1 │ - │ 2021-12-16 01:00:00 ┆ 2 │ - │ 2021-12-16 01:30:00 ┆ 3 │ - │ 2021-12-16 02:00:00 ┆ 4 │ - │ 2021-12-16 02:30:00 ┆ 5 │ - │ 2021-12-16 03:00:00 ┆ 6 │ - └─────────────────────┴─────┘ - - Group by windows of 1 hour starting at 2021-12-16 00:00:00. - - >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( - ... pl.col("n") - ... ).collect() - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [1, 2] │ - │ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ 2021-12-16 02:00:00 ┆ [5, 6] │ - └─────────────────────┴───────────┘ - - The window boundaries can also be added to the aggregation result - - >>> lf.group_by_dynamic( - ... "time", every="1h", include_boundaries=True, closed="right" - ... 
).agg(pl.col("n").mean()).collect() - shape: (4, 4) - ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ - │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ - ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ - │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ - │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ - │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ - │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ - └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ - - When closed="left", the window excludes the right end of interval: - [lower_bound, upper_bound) - - >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( - ... pl.col("n") - ... ).collect() - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-16 00:00:00 ┆ [0, 1] │ - │ 2021-12-16 01:00:00 ┆ [2, 3] │ - │ 2021-12-16 02:00:00 ┆ [4, 5] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - When closed="both" the time values at the window boundaries belong to 2 groups. - - >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( - ... pl.col("n") - ... ).collect() - shape: (5, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ - │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - Dynamic group bys can also be combined with grouping on normal keys - - >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) - >>> lf.collect() - shape: (7, 3) - ┌─────────────────────┬─────┬────────┐ - │ time ┆ n ┆ groups │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ str │ - ╞═════════════════════╪═════╪════════╡ - │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ - │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ - │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ - │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ - │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ - │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ - │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ - └─────────────────────┴─────┴────────┘ - >>> lf.group_by_dynamic( - ... "time", - ... every="1h", - ... closed="both", - ... by="groups", - ... include_boundaries=True, - ... 
).agg(pl.col("n")).collect() - shape: (7, 5) - ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ - │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ - ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ - │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ - │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ - │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ - │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ - │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ - └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ - - Dynamic group by on an index column - - >>> lf = pl.LazyFrame( - ... { - ... "idx": pl.int_range(0, 6, eager=True), - ... "A": ["A", "A", "B", "B", "B", "C"], - ... } - ... ) - >>> lf.group_by_dynamic( - ... "idx", - ... every="2i", - ... period="3i", - ... include_boundaries=True, - ... closed="right", - ... ).agg(pl.col("A").alias("A_agg_list")).collect() - shape: (4, 4) - ┌─────────────────┬─────────────────┬─────┬─────────────────┐ - │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 ┆ list[str] │ - ╞═════════════════╪═════════════════╪═════╪═════════════════╡ - │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ - │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ - │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ - │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ - └─────────────────┴─────────────────┴─────┴─────────────────┘ - - ''' - def join_asof(self, other: LazyFrame) -> Self: - ''' - Perform an asof join. - - This is similar to a left-join except that we match on nearest key rather than - equal keys. - - Both DataFrames must be sorted by the join_asof key. - - For each row in the left DataFrame: - - - A "backward" search selects the last row in the right DataFrame whose - \'on\' key is less than or equal to the left\'s key. - - - A "forward" search selects the first row in the right DataFrame whose - \'on\' key is greater than or equal to the left\'s key. - - A "nearest" search selects the last row in the right DataFrame whose value - is nearest to the left\'s key. String keys are not currently supported for a - nearest search. - - The default is "backward". - - Parameters - ---------- - other - Lazy DataFrame to join with. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - by - Join on these columns before doing asof join. - by_left - Join on these columns before doing asof join. - by_right - Join on these columns before doing asof join. - strategy : {\'backward\', \'forward\', \'nearest\'} - Join strategy. - suffix - Suffix to append to columns with a duplicate name. - tolerance - Numeric tolerance. By setting this the join will only be done if the near - keys are within this distance. 
If an asof join is done on columns of dtype - "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta - object or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. - - Examples - -------- - >>> from datetime import datetime - >>> gdp = pl.LazyFrame( - ... { - ... "date": [ - ... datetime(2016, 1, 1), - ... datetime(2017, 1, 1), - ... datetime(2018, 1, 1), - ... datetime(2019, 1, 1), - ... ], # note record date: Jan 1st (sorted!) - ... "gdp": [4164, 4411, 4566, 4696], - ... } - ... ).set_sorted("date") - >>> population = pl.LazyFrame( - ... { - ... "date": [ - ... datetime(2016, 5, 12), - ... datetime(2017, 5, 12), - ... datetime(2018, 5, 12), - ... datetime(2019, 5, 12), - ... ], # note record date: May 12th (sorted!) - ... "population": [82.19, 82.66, 83.12, 83.52], - ... } - ... ).set_sorted("date") - >>> population.join_asof(gdp, on="date", strategy="backward").collect() - shape: (4, 3) - ┌─────────────────────┬────────────┬──────┐ - │ date ┆ population ┆ gdp │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ f64 ┆ i64 │ - ╞═════════════════════╪════════════╪══════╡ - │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ - │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ - │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ - │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ - └─────────────────────┴────────────┴──────┘ - - ''' - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: - ''' - Add a join operation to the Logical Plan. - - Parameters - ---------- - other - Lazy DataFrame to join with. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} - Join strategy. - - .. note:: - A left join preserves the row order of the left DataFrame. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - suffix - Suffix to append to columns with a duplicate name. - validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} - Checks if join is of specified type. - - * *many_to_many* - “m:m”: default, does not result in checks - * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets - * *one_to_many* - “1:m”: check if join keys are unique in left dataset - * *many_to_one* - “m:1”: check if join keys are unique in right dataset - - .. note:: - - - This is currently not supported the streaming engine. - - This is only supported when joined by single columns. - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. 
- - See Also - -------- - join_asof - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> other_lf = pl.LazyFrame( - ... { - ... "apple": ["x", "y", "z"], - ... "ham": ["a", "b", "d"], - ... } - ... ) - >>> lf.join(other_lf, on="ham").collect() - shape: (2, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - └─────┴─────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="outer").collect() - shape: (4, 4) - ┌──────┬──────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞══════╪══════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ null ┆ null ┆ d ┆ z │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └──────┴──────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="left").collect() - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └─────┴─────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="semi").collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 7.0 ┆ b │ - └─────┴─────┴─────┘ - >>> lf.join(other_lf, on="ham", how="anti").collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - ''' - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - Notes - ----- - Creating a new LazyFrame using this method does not create a new copy of - existing data. - - Examples - -------- - Pass an expression to add it as a new column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ a^2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ - └─────┴──────┴───────┴──────┘ - - Added columns will replace existing columns with the same name. - - >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1.0 ┆ 0.5 ┆ true │ - │ 2.0 ┆ 4.0 ┆ true │ - │ 3.0 ┆ 10.0 ┆ false │ - │ 4.0 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - Multiple columns can be added by passing a list of expressions. - - >>> lf.with_columns( - ... [ - ... 
(pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ] - ... ).collect() - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Multiple columns also can be added using positional arguments instead of a list. - - >>> lf.with_columns( - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ).collect() - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> lf.with_columns( - ... ab=pl.col("a") * pl.col("b"), - ... not_c=pl.col("c").not_(), - ... ).collect() - shape: (4, 5) - ┌─────┬──────┬───────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ ab ┆ not_c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ - └─────┴──────┴───────┴──────┴───────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... lf.drop("c").with_columns( - ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), - ... ).collect() - ... - shape: (4, 3) - ┌─────┬──────┬─────────────┐ - │ a ┆ b ┆ diffs │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ struct[2] │ - ╞═════╪══════╪═════════════╡ - │ 1 ┆ 0.5 ┆ {null,null} │ - │ 2 ┆ 4.0 ┆ {1,3.5} │ - │ 3 ┆ 10.0 ┆ {1,6.0} │ - │ 4 ┆ 13.0 ┆ {1,3.0} │ - └─────┴──────┴─────────────┘ - - ''' - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - """ - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - See Also - -------- - with_columns - - """ - def with_context(self, other: Self | list[Self]) -> Self: - ''' - Add an external context to the computation graph. - - This allows expressions to also access columns from DataFrames - that are not part of this one. - - Parameters - ---------- - other - Lazy DataFrame to join with. 
- - Examples - -------- - >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) - >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) - >>> lf.with_context(lf_other).select( - ... pl.col("b") + pl.col("c").first() - ... ).collect() - shape: (3, 1) - ┌──────┐ - │ b │ - │ --- │ - │ str │ - ╞══════╡ - │ afoo │ - │ cfoo │ - │ null │ - └──────┘ - - Fill nulls with the median from another DataFrame: - - >>> train_lf = pl.LazyFrame( - ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} - ... ) - >>> test_lf = pl.LazyFrame( - ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} - ... ) - >>> test_lf.with_context( - ... train_lf.select(pl.all().name.suffix("_train")) - ... ).select( - ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) - ... ).collect() - shape: (3, 1) - ┌───────────┐ - │ feature_0 │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -1.0 │ - │ 0.0 │ - │ 1.0 │ - └───────────┘ - - ''' - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Remove columns from the DataFrame. - - Parameters - ---------- - columns - Name of the column(s) that should be removed from the DataFrame. - *more_columns - Additional columns to drop, specified as positional arguments. - - Examples - -------- - Drop a single column by passing the name of that column. - - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.drop("ham").collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 6.0 │ - │ 2 ┆ 7.0 │ - │ 3 ┆ 8.0 │ - └─────┴─────┘ - - Drop multiple columns by passing a selector. - - >>> import polars.selectors as cs - >>> lf.drop(cs.numeric()).collect() - shape: (3, 1) - ┌─────┐ - │ ham │ - │ --- │ - │ str │ - ╞═════╡ - │ a │ - │ b │ - │ c │ - └─────┘ - - Use positional arguments to drop multiple columns. - - >>> lf.drop("foo", "ham").collect() - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 6.0 │ - │ 7.0 │ - │ 8.0 │ - └─────┘ - - ''' - def rename(self, mapping: dict[str, str]) -> Self: - ''' - Rename column names. - - Parameters - ---------- - mapping - Key value pairs that map from old name to new name. - - Notes - ----- - If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), - polars will block projection and predicate pushdowns at this node. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.rename({"foo": "apple"}).collect() - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - - ''' - def reverse(self) -> Self: - ''' - Reverse the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "key": ["a", "b", "c"], - ... "val": [1, 2, 3], - ... } - ... ) - >>> lf.reverse().collect() - shape: (3, 2) - ┌─────┬─────┐ - │ key ┆ val │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ c ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 1 │ - └─────┴─────┘ - - ''' - def shift(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. 
- fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... ) - >>> lf.shift().collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ null ┆ null │ - │ 1 ┆ 5 │ - │ 2 ┆ 6 │ - │ 3 ┆ 7 │ - └──────┴──────┘ - - Pass a negative value to shift in the opposite direction instead. - - >>> lf.shift(-2).collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ null ┆ null │ - │ null ┆ null │ - └──────┴──────┘ - - Specify `fill_value` to fill the resulting null values. - - >>> lf.shift(-2, fill_value=100).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ 100 ┆ 100 │ - │ 100 ┆ 100 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int, length: int | None = ...) -> Self: - ''' - Get a slice of this DataFrame. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> lf.slice(1, 2).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ y ┆ 3 ┆ 4 │ - │ z ┆ 5 ┆ 6 │ - └─────┴─────┴─────┘ - - ''' - def limit(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Alias for :func:`LazyFrame.head`. - - Parameters - ---------- - n - Number of rows to return. - - Notes - ----- - Consider using the :func:`fetch` operation if you only want to test your - query. The :func:`fetch` operation will load the first `n` rows at the scan - level, whereas the :func:`head`/:func:`limit` are applied at the end. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... ) - >>> lf.limit().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - └─────┴─────┘ - >>> lf.limit(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - └─────┴─────┘ - - ''' - def head(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Notes - ----- - Consider using the :func:`fetch` operation if you only want to test your - query. The :func:`fetch` operation will load the first `n` rows at the scan - level, whereas the :func:`head`/:func:`limit` are applied at the end. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... 
) - >>> lf.head().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - └─────┴─────┘ - >>> lf.head(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - └─────┴─────┘ - - ''' - def tail(self, n: int = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... ) - >>> lf.tail().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - │ 6 ┆ 12 │ - └─────┴─────┘ - >>> lf.tail(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 5 ┆ 11 │ - │ 6 ┆ 12 │ - └─────┴─────┘ - - ''' - def last(self) -> Self: - ''' - Get the last row of the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.last().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 5 ┆ 6 │ - └─────┴─────┘ - - ''' - def first(self) -> Self: - ''' - Get the first row of the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.first().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_n_unique(self) -> Self: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.approx_n_unique().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_unique(self) -> Self: - """ - Approximate count of unique values. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`LazyFrame.approx_n_unique`. - - """ - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: - ''' - Add a column at index 0 that counts the rows. - - Parameters - ---------- - name - Name of the column to add. - offset - Start the row count at this offset. - - Warnings - -------- - This can have a negative effect on query performance. - This may, for instance, block predicate pushdown optimization. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.with_row_count().collect() - shape: (3, 3) - ┌────────┬─────┬─────┐ - │ row_nr ┆ a ┆ b │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╡ - │ 0 ┆ 1 ┆ 2 │ - │ 1 ┆ 3 ┆ 4 │ - │ 2 ┆ 5 ┆ 6 │ - └────────┴─────┴─────┘ - - ''' - def gather_every(self, n: int) -> Self: - ''' - Take every nth row in the LazyFrame and return as a new LazyFrame. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... 
) - >>> lf.gather_every(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 5 │ - │ 3 ┆ 7 │ - └─────┴─────┘ - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - matches_supertype - Fill all matching supertypes of the fill `value` literal. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, None, 4], - ... "b": [0.5, 4, None, 13], - ... } - ... ) - >>> lf.fill_null(99).collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 99 ┆ 99.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - >>> lf.fill_null(strategy="forward").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> lf.fill_null(strategy="max").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> lf.fill_null(strategy="zero").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 0 ┆ 0.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Self: - ''' - Fill floating point NaN values. - - Parameters - ---------- - value - Value to fill the NaN values with. - - Warnings - -------- - Note that floating point NaN (Not a Number) are not missing values! - To replace missing values, use :func:`fill_null` instead. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1.5, 2, float("nan"), 4], - ... "b": [0.5, 4, float("nan"), 13], - ... } - ... ) - >>> lf.fill_nan(99).collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════╡ - │ 1.5 ┆ 0.5 │ - │ 2.0 ┆ 4.0 │ - │ 99.0 ┆ 99.0 │ - │ 4.0 ┆ 13.0 │ - └──────┴──────┘ - - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their standard deviation value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.std().collect() - shape: (1, 2) - ┌──────────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪═════╡ - │ 1.290994 ┆ 0.5 │ - └──────────┴─────┘ - >>> lf.std(ddof=0).collect() - shape: (1, 2) - ┌──────────┬──────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪══════════╡ - │ 1.118034 ┆ 0.433013 │ - └──────────┴──────────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their variance value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. 
- - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.var().collect() - shape: (1, 2) - ┌──────────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪══════╡ - │ 1.666667 ┆ 0.25 │ - └──────────┴──────┘ - >>> lf.var(ddof=0).collect() - shape: (1, 2) - ┌──────┬────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪════════╡ - │ 1.25 ┆ 0.1875 │ - └──────┴────────┘ - - ''' - def max(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their maximum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.max().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def min(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their minimum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.min().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 1 │ - └─────┴─────┘ - - ''' - def sum(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their sum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.sum().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 10 ┆ 5 │ - └─────┴─────┘ - - ''' - def mean(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their mean value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.mean().collect() - shape: (1, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════╡ - │ 2.5 ┆ 1.25 │ - └─────┴──────┘ - - ''' - def median(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their median value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.median().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 2.5 ┆ 1.0 │ - └─────┴─────┘ - - ''' - def null_count(self) -> Self: - ''' - Aggregate the columns in the LazyFrame as the sum of their null value count. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, None, 3], - ... "bar": [6, 7, None], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.null_count().collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 1 ┆ 0 │ - └─────┴─────┴─────┘ - - ''' - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... 
) - >>> lf.quantile(0.7).collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 3.0 ┆ 1.0 │ - └─────┴─────┘ - - ''' - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: - ''' - Explode the DataFrame to long format by exploding the given columns. - - Parameters - ---------- - columns - Column names, expressions, or a selector defining them. The underlying - columns being exploded must be of List or Utf8 datatype. - *more_columns - Additional names of columns to explode, specified as positional arguments. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "letters": ["a", "a", "b", "c"], - ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], - ... } - ... ) - >>> lf.explode("numbers").collect() - shape: (8, 2) - ┌─────────┬─────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════════╪═════════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ a ┆ 3 │ - │ b ┆ 4 │ - │ b ┆ 5 │ - │ c ┆ 6 │ - │ c ┆ 7 │ - │ c ┆ 8 │ - └─────────┴─────────┘ - - ''' - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Drop duplicate rows from this DataFrame. - - Parameters - ---------- - subset - Column name(s) or selector(s), to consider when identifying - duplicate rows. If set to `None` (default), use all columns. - keep : {\'first\', \'last\', \'any\', \'none\'} - Which of the duplicate rows to keep. - - * \'any\': Does not give any guarantee of which row is kept. - This allows more optimizations. - * \'none\': Don\'t keep duplicate rows. - * \'first\': Keep first unique row. - * \'last\': Keep last unique row. - maintain_order - Keep the same order as the original DataFrame. This is more expensive to - compute. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - Returns - ------- - LazyFrame - LazyFrame with unique rows. - - Warnings - -------- - This method will fail if there is a column of type `List` in the DataFrame or - subset. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3, 1], - ... "bar": ["a", "a", "a", "a"], - ... "ham": ["b", "b", "b", "b"], - ... } - ... ) - >>> lf.unique(maintain_order=True).collect() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> lf.unique(keep="last", maintain_order=True).collect() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - - ''' - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Drop all rows that contain null values. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - subset - Column name(s) for which null values are considered. - If set to `None` (default), use all columns. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, None, 8], - ... "ham": ["a", "b", None], - ... } - ... 
) - - The default behavior of this method is to drop rows where any single - value of the row is null. - - >>> lf.drop_nulls().collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - This behaviour can be constrained to consider only a subset of columns, as - defined by name or with a selector. For example, dropping rows if there is - a null in any of the integer columns: - - >>> import polars.selectors as cs - >>> lf.drop_nulls(subset=cs.integer()).collect() - shape: (2, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ null │ - └─────┴─────┴──────┘ - - This method drops a row if any single value of the row is null. - - Below are some example snippets that show how you could drop null - values based on other conditions: - - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, None, None, None], - ... "b": [1, 2, None, 1], - ... "c": [1, None, None, 1], - ... } - ... ) - >>> lf.collect() - shape: (4, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪══════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ null ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴──────┴──────┘ - - Drop a row only if all values are null: - - >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴─────┴──────┘ - - ''' - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: - ''' - Unpivot a DataFrame from wide to long format. - - Optionally leaves identifiers set. - - This function is useful to massage a DataFrame into a format where one or more - columns are identifier variables (id_vars) while all other columns, considered - measured variables (value_vars), are "unpivoted" to the row axis leaving just - two non-identifier columns, \'variable\' and \'value\'. - - Parameters - ---------- - id_vars - Column(s) or selector(s) to use as identifier variables. - value_vars - Column(s) or selector(s) to use as values variables; if `value_vars` - is empty all columns that are not in `id_vars` will be used. - variable_name - Name to give to the `variable` column. Defaults to "variable" - value_name - Name to give to the `value` column. Defaults to "value" - streamable - Allow this node to run in the streaming engine. - If this runs in streaming, the output of the melt operation - will not have a stable ordering. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> import polars.selectors as cs - >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() - shape: (6, 3) - ┌─────┬──────────┬───────┐ - │ a ┆ variable ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 │ - ╞═════╪══════════╪═══════╡ - │ x ┆ b ┆ 1 │ - │ y ┆ b ┆ 3 │ - │ z ┆ b ┆ 5 │ - │ x ┆ c ┆ 2 │ - │ y ┆ c ┆ 4 │ - │ z ┆ c ┆ 6 │ - └─────┴──────────┴───────┘ - - ''' - def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: - ''' - Apply a custom function. 
- - It is important that the function returns a Polars DataFrame. - - Parameters - ---------- - function - Lambda/ function to apply. - predicate_pushdown - Allow predicate pushdown optimization to pass this node. - projection_pushdown - Allow projection pushdown optimization to pass this node. - slice_pushdown - Allow slice pushdown optimization to pass this node. - no_optimizations - Turn off all optimizations past this point. - schema - Output schema of the function, if set to `None` we assume that the schema - will remain unchanged by the applied function. - validate_output_schema - It is paramount that polars\' schema is correct. This flag will ensure that - the output schema of this function will be checked with the expected schema. - Setting this to `False` will not do this check, but may lead to hard to - debug bugs. - streamable - Whether the function that is given is eligible to be running with the - streaming engine. That means that the function must produce the same result - when it is executed in batches or when it is be executed on the full - dataset. - - Warnings - -------- - The `schema` of a `LazyFrame` must always be correct. It is up to the caller - of this function to ensure that this invariant is upheld. - - It is important that the optimization flags are correct. If the custom function - for instance does an aggregation of a column, `predicate_pushdown` should not - be allowed, as this prunes rows and will influence your aggregation results. - - Examples - -------- - >>> lf = ( # doctest: +SKIP - ... pl.LazyFrame( - ... { - ... "a": pl.int_range(-100_000, 0, eager=True), - ... "b": pl.int_range(0, 100_000, eager=True), - ... } - ... ) - ... .map_batches(lambda x: 2 * x, streamable=True) - ... .collect(streaming=True) - ... ) - shape: (100_000, 2) - ┌─────────┬────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════════╪════════╡ - │ -200000 ┆ 0 │ - │ -199998 ┆ 2 │ - │ -199996 ┆ 4 │ - │ -199994 ┆ 6 │ - │ … ┆ … │ - │ -8 ┆ 199992 │ - │ -6 ┆ 199994 │ - │ -4 ┆ 199996 │ - │ -2 ┆ 199998 │ - └─────────┴────────┘ - - ''' - def interpolate(self) -> Self: - ''' - Interpolate intermediate values. The interpolation method is linear. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, None, 9, 10], - ... "bar": [6, 7, 9, None], - ... "baz": [1, None, None, 9], - ... } - ... ) - >>> lf.interpolate().collect() - shape: (4, 3) - ┌──────┬──────┬──────────┐ - │ foo ┆ bar ┆ baz │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════════╡ - │ 1.0 ┆ 6.0 ┆ 1.0 │ - │ 5.0 ┆ 7.0 ┆ 3.666667 │ - │ 9.0 ┆ 9.0 ┆ 6.333333 │ - │ 10.0 ┆ null ┆ 9.0 │ - └──────┴──────┴──────────┘ - - ''' - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Decompose struct columns into separate columns for each of their fields. - - The new columns will be inserted into the DataFrame at the location of the - struct column. - - Parameters - ---------- - columns - Name of the struct column(s) that should be unnested. - *more_columns - Additional columns to unnest, specified as positional arguments. - - Examples - -------- - >>> df = pl.LazyFrame( - ... { - ... "before": ["foo", "bar"], - ... "t_a": [1, 2], - ... "t_b": ["a", "b"], - ... "t_c": [True, None], - ... "t_d": [[1, 2], [3]], - ... "after": ["baz", "womp"], - ... } - ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") - >>> df.collect() - shape: (2, 3) - ┌────────┬─────────────────────┬───────┐ - │ before ┆ t_struct ┆ after │ - │ --- ┆ --- ┆ --- │ - │ str ┆ struct[4] ┆ str │ - ╞════════╪═════════════════════╪═══════╡ - │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ - │ bar ┆ {2,"b",null,[3]} ┆ womp │ - └────────┴─────────────────────┴───────┘ - >>> df.unnest("t_struct").collect() - shape: (2, 6) - ┌────────┬─────┬─────┬──────┬───────────┬───────┐ - │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ - ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ - │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ - │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ - └────────┴─────┴─────┴──────┴───────────┴───────┘ - - ''' - def merge_sorted(self, other: LazyFrame, key: str) -> Self: - ''' - Take two sorted DataFrames and merge them by the sorted key. - - The output of this operation will also be sorted. - It is the callers responsibility that the frames are sorted - by that key otherwise the output will not make sense. - - The schemas of both LazyFrames must be equal. - - Parameters - ---------- - other - Other DataFrame that must be merged - key - Key that is sorted. - - Examples - -------- - >>> df0 = pl.LazyFrame( - ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} - ... ).sort("age") - >>> df0.collect() - shape: (3, 2) - ┌───────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═════╡ - │ bob ┆ 18 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └───────┴─────┘ - >>> df1 = pl.LazyFrame( - ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} - ... ).sort("age") - >>> df1.collect() - shape: (4, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - └────────┴─────┘ - >>> df0.merge_sorted(df1, key="age").collect() - shape: (7, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ bob ┆ 18 │ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └────────┴─────┘ - ''' - def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: - """ - Indicate that one or multiple columns are sorted. - - Parameters - ---------- - column - Columns that are sorted - more_columns - Additional columns that are sorted, specified as positional arguments. - descending - Whether the columns are sorted in descending order. - """ - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: - ''' - Update the values in this `LazyFrame` with the non-null values in `other`. - - Parameters - ---------- - other - LazyFrame that will be used to update the values - on - Column names that will be joined on; if given `None` the implicit row - index is used as a join key instead. - left_on - Join column(s) of the left DataFrame. - right_on - Join column(s) of the right DataFrame. - how : {\'left\', \'inner\', \'outer\'} - * \'left\' will keep all rows from the left table; rows may be duplicated - if multiple rows in the right frame match the left row\'s key. - * \'inner\' keeps only those rows where the key exists in both frames. 
- * \'outer\' will update existing rows where the key matches while also - adding any new rows contained in the given frame. - include_nulls - If True, null values from the right DataFrame will be used to update the - left DataFrame. - - Notes - ----- - This is syntactic sugar for a left/inner join, with an optional coalesce when - `include_nulls = False`. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "A": [1, 2, 3, 4], - ... "B": [400, 500, 600, 700], - ... } - ... ) - >>> lf.collect() - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 400 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - >>> new_lf = pl.LazyFrame( - ... { - ... "B": [-66, None, -99], - ... "C": [5, 3, 1], - ... } - ... ) - - Update `df` values with the non-null values in `new_df`, by row index: - - >>> lf.update(new_lf).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, by row index, - but only keeping those rows that are common to both frames: - - >>> lf.update(new_lf, how="inner").collect() - shape: (3, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() - shape: (5, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴─────┘ - - Update `df` values including null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> lf.update( - ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True - ... ).collect() - shape: (5, 2) - ┌─────┬──────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ null │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴──────┘ - - ''' - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: - """ - Start a group by operation. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.group_by`. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - """ - def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. 
- period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - """ - def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.9 - This method has been renamed to :func:`LazyFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - """ - def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.group_by_dynamic`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - include_boundaries - Add the lower and upper bound of the window to the "_lower_bound" and - "_upper_bound" columns. 
This will impact performance because it\'s harder to - parallelize - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - ''' - def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: - """ - Apply a custom function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.map_batches`. - - Parameters - ---------- - function - Lambda/ function to apply. - predicate_pushdown - Allow predicate pushdown optimization to pass this node. - projection_pushdown - Allow projection pushdown optimization to pass this node. - slice_pushdown - Allow slice pushdown optimization to pass this node. - no_optimizations - Turn off all optimizations past this point. - schema - Output schema of the function, if set to `None` we assume that the schema - will remain unchanged by the applied function. - validate_output_schema - It is paramount that polars' schema is correct. This flag will ensure that - the output schema of this function will be checked with the expected schema. - Setting this to `False` will not do this check, but may lead to hard to - debug bugs. - streamable - Whether the function that is given is eligible to be running with the - streaming engine. That means that the function must produce the same result - when it is executed in batches or when it is be executed on the full - dataset. - - """ - def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def take_every(self, n: int) -> Self: - """ - Take every nth row in the LazyFrame and return as a new LazyFrame. - - .. deprecated:: 0.19.0 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - @property - def columns(self): ... - @property - def dtypes(self): ... - @property - def schema(self): ... - @property - def width(self): ... 
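The rename hunk that follows shows the regenerated LazyFrame stub gaining a "#: version 0.19.18" stamp as its first line. As a hedged illustration only (the helper name and the use of packaging.version are assumptions, not part of this diff), such a stamp could be read back from a stub file like this:

from pathlib import Path
from packaging import version

def read_stub_version(stub_path: Path) -> version.Version:
    # The first line of a regenerated stub carries the version stamp,
    # e.g. "#: version 0.19.18" as in the hunk below.
    first_line = stub_path.read_text().splitlines()[0]
    if not first_line.startswith("#: version "):
        raise ValueError(f"{stub_path} has no version stamp")
    return version.parse(first_line.removeprefix("#: version "))

# Example (path taken from the rename hunk below):
# read_stub_version(Path("src/polugins_type_gen/_stubs/0.19.18/polars/lazyframe/frame.pyi"))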
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/lazyframe/frame.pyi similarity index 99% rename from polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/lazyframe/frame rename to polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/lazyframe/frame.pyi index 561f5b2..f4d8116 100644 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/lazyframe/frame +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/lazyframe/frame.pyi @@ -1,3 +1,4 @@ +#: version 0.19.18 import P import np import pa diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/series/series deleted file mode 100644 index 4a40006..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/series/series +++ /dev/null @@ -1,4988 +0,0 @@ -import np as np -import pa as pa -import pd as pd -from builtins import PySeries -from datetime import date, datetime, timedelta -from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Object as Object, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown, Utf8 as Utf8 -from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat -from polars.exceptions import ShapeError as ShapeError -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as 
issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence - -TYPE_CHECKING: bool -_PYARROW_AVAILABLE: bool - -class Series: - _s: _ClassVar[None] = ... - _accessors: _ClassVar[set] = ... - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array) -> Self: - """Construct a Series from an Arrow Array.""" - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: - """Construct a Series from a pandas Series or DatetimeIndex.""" - def _get_ptr(self) -> tuple[int, int, int]: - """ - Get a pointer to the start of the values buffer of a numeric Series. - - This will raise an error if the `Series` contains multiple chunks. - - This will return the offset, length and the pointer itself. - - """ - def __bool__(self) -> NoReturn: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Series: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... - def __eq__(self, other: Any) -> Series | Expr: ... - def __ne__(self, other: Any) -> Series | Expr: ... - def __gt__(self, other: Any) -> Series | Expr: ... - def __lt__(self, other: Any) -> Series | Expr: ... - def __ge__(self, other: Any) -> Series | Expr: ... - def __le__(self, other: Any) -> Series | Expr: ... - def le(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series <= other`.""" - def lt(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series < other`.""" - def eq(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series == other`.""" - def eq_missing(self, other: Any) -> Self | Expr: - ''' - Method equivalent of equality operator `series == other` where `None == None`. - - This differs from the standard `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - See Also - -------- - ne_missing - eq - - Examples - -------- - >>> s1 = pl.Series("a", [333, 200, None]) - >>> s2 = pl.Series("a", [100, 200, None]) - >>> s1.eq(s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - null - ] - >>> s1.eq_missing(s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - ''' - def ne(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series != other`.""" - def ne_missing(self, other: Any) -> Self | Expr: - ''' - Method equivalent of equality operator `series != other` where `None == None`. - - This differs from the standard `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. 
- - See Also - -------- - eq_missing - ne - - Examples - -------- - >>> s1 = pl.Series("a", [333, 200, None]) - >>> s2 = pl.Series("a", [100, 200, None]) - >>> s1.ne(s2) - shape: (3,) - Series: \'a\' [bool] - [ - true - false - null - ] - >>> s1.ne_missing(s2) - shape: (3,) - Series: \'a\' [bool] - [ - true - false - false - ] - - ''' - def ge(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series >= other`.""" - def gt(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series > other`.""" - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - def __add__(self, other: Any) -> Self | DataFrame | Expr: ... - def __sub__(self, other: Any) -> Self | Expr: ... - def __truediv__(self, other: Any) -> Series | Expr: ... - def __floordiv__(self, other: Any) -> Series | Expr: ... - def __invert__(self) -> Series: ... - def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... - def __mod__(self, other: Any) -> Series | Expr: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: - """ - Numpy __array__ interface protocol. - - Ensures that `np.asarray(pl.Series(..))` works as expected, see - https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. - """ - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: - """Numpy universal functions.""" - def __column_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. - """ - def _repr_html_(self) -> str: - """Format output data in HTML for display in Jupyter Notebooks.""" - def item(self, index: int | None = ...) -> Any: - ''' - Return the Series as a scalar, or return the element at the given index. - - If no index is provided, this is equivalent to `s[0]`, with a check - that the shape is (1,). With an index, this is equivalent to `s[index]`. - - Examples - -------- - >>> s1 = pl.Series("a", [1]) - >>> s1.item() - 1 - >>> s2 = pl.Series("a", [9, 8, 7]) - >>> s2.cum_sum().item(-1) - 24 - - ''' - def estimated_size(self, unit: SizeUnit = ...) -> int | float: - ''' - Return an estimation of the total (heap) allocated size of the Series. 
- - Estimated size is given in the specified unit (bytes by default). - - This estimation is the sum of the size of its buffers, validity, including - nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the - size of 2 arrays is not the sum of the sizes computed from this function. In - particular, [`StructArray`]\'s size is an upper bound. - - When an array is sliced, its allocated size remains constant because the buffer - unchanged. However, this function will yield a smaller number. This is because - this function returns the visible size of the buffer, not its total capacity. - - FFI buffers are included in this estimation. - - Parameters - ---------- - unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} - Scale the returned size to the given unit. - - Examples - -------- - >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) - >>> s.estimated_size() - 4000000 - >>> s.estimated_size("mb") - 3.814697265625 - - ''' - def sqrt(self) -> Series: - """ - Compute the square root of the elements. - - Syntactic sugar for - - >>> pl.Series([1, 2]) ** 0.5 - shape: (2,) - Series: '' [f64] - [ - 1.0 - 1.414214 - ] - - """ - def cbrt(self) -> Series: - """ - Compute the cube root of the elements. - - Optimization for - - >>> pl.Series([1, 2]) ** (1.0 / 3) - shape: (2,) - Series: '' [f64] - [ - 1.0 - 1.259921 - ] - - """ - def any(self) -> bool | None: - """ - Return whether any of the values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is `None`. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - bool or None - - Examples - -------- - >>> pl.Series([True, False]).any() - True - >>> pl.Series([False, False]).any() - False - >>> pl.Series([None, False]).any() - False - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None - - """ - def all(self) -> bool | None: - """ - Return whether all values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is `None`. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - bool or None - - Examples - -------- - >>> pl.Series([True, True]).all() - True - >>> pl.Series([False, True]).all() - False - >>> pl.Series([None, True]).all() - True - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None - - """ - def log(self, base: float = ...) -> Series: - """Compute the logarithm to a given base.""" - def log1p(self) -> Series: - """Compute the natural logarithm of the input array plus one, element-wise.""" - def log10(self) -> Series: - """Compute the base 10 logarithm of the input array, element-wise.""" - def exp(self) -> Series: - """Compute the exponential, element-wise.""" - def drop_nulls(self) -> Series: - ''' - Drop all null values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nans - - Notes - ----- - A null value is not the same as a NaN value. 
- To drop NaN values, use :func:`drop_nans`. - - Examples - -------- - >>> s = pl.Series([1.0, None, 3.0, float("nan")]) - >>> s.drop_nulls() - shape: (3,) - Series: \'\' [f64] - [ - 1.0 - 3.0 - NaN - ] - - ''' - def drop_nans(self) -> Series: - ''' - Drop all floating point NaN values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nulls - - Notes - ----- - A NaN value is not the same as a null value. - To drop null values, use :func:`drop_nulls`. - - Examples - -------- - >>> s = pl.Series([1.0, None, 3.0, float("nan")]) - >>> s.drop_nans() - shape: (3,) - Series: \'\' [f64] - [ - 1.0 - null - 3.0 - ] - - ''' - def to_frame(self, name: str | None = ...) -> DataFrame: - ''' - Cast this Series to a DataFrame. - - Parameters - ---------- - name - optionally name/rename the Series column in the new DataFrame. - - Examples - -------- - >>> s = pl.Series("a", [123, 456]) - >>> df = s.to_frame() - >>> df - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 123 │ - │ 456 │ - └─────┘ - - >>> df = s.to_frame("xyz") - >>> df - shape: (2, 1) - ┌─────┐ - │ xyz │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 123 │ - │ 456 │ - └─────┘ - - ''' - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: - ''' - Quick summary statistics of a Series. - - Series with mixed datatypes will return summary statistics for the datatype of - the first value. - - Parameters - ---------- - percentiles - One or more percentiles to include in the summary statistics (if the - Series has a numeric dtype). All values must be in the range `[0, 1]`. - - Notes - ----- - The median is included by default as the 50% percentile. - - Returns - ------- - DataFrame - Mapping with summary statistics of a Series. - - Examples - -------- - >>> series_num = pl.Series([1, 2, 3, 4, 5]) - >>> series_num.describe() - shape: (9, 2) - ┌────────────┬──────────┐ - │ statistic ┆ value │ - │ --- ┆ --- │ - │ str ┆ f64 │ - ╞════════════╪══════════╡ - │ count ┆ 5.0 │ - │ null_count ┆ 0.0 │ - │ mean ┆ 3.0 │ - │ std ┆ 1.581139 │ - │ min ┆ 1.0 │ - │ 25% ┆ 2.0 │ - │ 50% ┆ 3.0 │ - │ 75% ┆ 4.0 │ - │ max ┆ 5.0 │ - └────────────┴──────────┘ - - >>> series_str = pl.Series(["a", "a", None, "b", "c"]) - >>> series_str.describe() - shape: (3, 2) - ┌────────────┬───────┐ - │ statistic ┆ value │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════════╪═══════╡ - │ count ┆ 5 │ - │ null_count ┆ 1 │ - │ unique ┆ 4 │ - └────────────┴───────┘ - - ''' - def sum(self) -> int | float: - ''' - Reduce this Series to the sum value. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.sum() - 6 - - ''' - def mean(self) -> int | float | None: - ''' - Reduce this Series to the mean value. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.mean() - 2.0 - - ''' - def product(self) -> int | float: - """Reduce this Series to the product value.""" - def pow(self, exponent: int | float | None | Series) -> Series: - ''' - Raise to the power of the given exponent. - - Parameters - ---------- - exponent - The exponent. Accepts Series input. - - Examples - -------- - >>> s = pl.Series("foo", [1, 2, 3, 4]) - >>> s.pow(3) - shape: (4,) - Series: \'foo\' [f64] - [ - 1.0 - 8.0 - 27.0 - 64.0 - ] - - ''' - def min(self) -> PythonLiteral | None: - ''' - Get the minimal value in this Series. 
- - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.min() - 1 - - ''' - def max(self) -> PythonLiteral | None: - ''' - Get the maximum value in this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.max() - 3 - - ''' - def nan_max(self) -> int | float | date | datetime | timedelta | str: - """ - Get maximum value, but propagate/poison encountered NaN values. - - This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - """ - def nan_min(self) -> int | float | date | datetime | timedelta | str: - """ - Get minimum value, but propagate/poison encountered NaN values. - - This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - """ - def std(self, ddof: int = ...) -> float | None: - ''' - Get the standard deviation of this Series. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.std() - 1.0 - - ''' - def var(self, ddof: int = ...) -> float | None: - ''' - Get variance of this Series. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.var() - 1.0 - - ''' - def median(self) -> float | None: - ''' - Get the median of this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.median() - 2.0 - - ''' - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: - ''' - Get the quantile value of this Series. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.quantile(0.5) - 2.0 - - ''' - def to_dummies(self, separator: str = ...) -> DataFrame: - ''' - Get dummy/indicator variables. - - Parameters - ---------- - separator - Separator/delimiter used when generating column names. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.to_dummies() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a_1 ┆ a_2 ┆ a_3 │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 0 ┆ 0 │ - │ 0 ┆ 1 ┆ 0 │ - │ 0 ┆ 0 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def cut(self, breaks: Sequence[float]) -> Series | DataFrame: - ''' - Bin continuous values into discrete categories. - - Parameters - ---------- - breaks - List of unique cut points. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - break_point_label - Name of the breakpoint column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - category_label - Name of the category column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - left_closed - Set the intervals to be left-closed instead of right-closed. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. 
This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - as_series - If set to `False`, return a DataFrame containing the original values, - the breakpoints, and the categories. - - .. deprecated:: 0.19.0 - This parameter will be removed. The same behavior can be achieved by - setting `include_breaks=True`, unnesting the resulting struct Series, - and adding the result to the original Series. - - Returns - ------- - Series - Series of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise a Series of data type :class:`Struct`. - - See Also - -------- - qcut - - Examples - -------- - Divide the column into three categories. - - >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) - >>> s.cut([-1, 1], labels=["a", "b", "c"]) - shape: (5,) - Series: \'foo\' [cat] - [ - "a" - "a" - "b" - "b" - "c" - ] - - Create a DataFrame with the breakpoint and category for each value. - - >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") - >>> s.to_frame().with_columns(cut).unnest("cut") - shape: (5, 3) - ┌─────┬─────────────┬────────────┐ - │ foo ┆ break_point ┆ category │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪═════════════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴─────────────┴────────────┘ - - ''' - def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: - ''' - Bin continuous values into discrete categories based on their quantiles. - - Parameters - ---------- - quantiles - Either a list of quantile probabilities between 0 and 1 or a positive - integer determining the number of bins with uniform probability. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - left_closed - Set the intervals to be left-closed instead of right-closed. - allow_duplicates - If set to `True`, duplicates in the resulting quantiles are dropped, - rather than raising a `DuplicateError`. This can happen even with unique - probabilities, depending on the data. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - break_point_label - Name of the breakpoint column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - category_label - Name of the category column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - as_series - If set to `False`, return a DataFrame containing the original values, - the breakpoints, and the categories. - - .. deprecated:: 0.19.0 - This parameter will be removed. The same behavior can be achieved by - setting `include_breaks=True`, unnesting the resulting struct Series, - and adding the result to the original Series. - - Returns - ------- - Series - Series of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise a Series of data type :class:`Struct`. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. 
- - See Also - -------- - cut - - Examples - -------- - Divide a column into three categories according to pre-defined quantile - probabilities. - - >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) - >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) - shape: (5,) - Series: \'foo\' [cat] - [ - "a" - "a" - "b" - "b" - "c" - ] - - Divide a column into two categories using uniform quantile probabilities. - - >>> s.qcut(2, labels=["low", "high"], left_closed=True) - shape: (5,) - Series: \'foo\' [cat] - [ - "low" - "low" - "high" - "high" - "high" - ] - - Create a DataFrame with the breakpoint and category for each value. - - >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") - >>> s.to_frame().with_columns(cut).unnest("cut") - shape: (5, 3) - ┌─────┬─────────────┬────────────┐ - │ foo ┆ break_point ┆ category │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪═════════════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴─────────────┴────────────┘ - - ''' - def rle(self) -> Series: - ''' - Get the lengths of runs of identical values. - - Returns - ------- - Series - Series of data type :class:`Struct` with Fields "lengths" and "values". - - Examples - -------- - >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) - >>> s.rle().struct.unnest() - shape: (6, 2) - ┌─────────┬────────┐ - │ lengths ┆ values │ - │ --- ┆ --- │ - │ i32 ┆ i64 │ - ╞═════════╪════════╡ - │ 2 ┆ 1 │ - │ 1 ┆ 2 │ - │ 1 ┆ 1 │ - │ 1 ┆ null │ - │ 1 ┆ 1 │ - │ 2 ┆ 3 │ - └─────────┴────────┘ - ''' - def rle_id(self) -> Series: - ''' - Map values to run IDs. - - Similar to RLE, but it maps each value to an ID corresponding to the run into - which it falls. This is especially useful when you want to define groups by - runs of identical values rather than the values themselves. - - Returns - ------- - Series - - See Also - -------- - rle - - Examples - -------- - >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) - >>> s.rle_id() - shape: (8,) - Series: \'s\' [u32] - [ - 0 - 0 - 1 - 2 - 3 - 4 - 5 - 5 - ] - ''' - def hist(self, bins: list[float] | None = ...) -> DataFrame: - ''' - Bin values into buckets and count their occurrences. - - Parameters - ---------- - bins - Discretizations to make. - If None given, we determine the boundaries based on the data. - bin_count - If no bins provided, this will be used to determine - the distance of the bins - - Returns - ------- - DataFrame - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Examples - -------- - >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) - >>> a.hist(bin_count=4) - shape: (5, 3) - ┌─────────────┬─────────────┬─────────┐ - │ break_point ┆ category ┆ a_count │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ cat ┆ u32 │ - ╞═════════════╪═════════════╪═════════╡ - │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ - │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ - │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ - │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ - │ inf ┆ (6.75, inf] ┆ 2 │ - └─────────────┴─────────────┴─────────┘ - - ''' - def value_counts(self) -> DataFrame: - ''' - Count the occurrences of unique values. - - Parameters - ---------- - sort - Sort the output by count in descending order. - If set to `False` (default), the order of the output is random. - parallel - Execute the computation in parallel. - - .. note:: - This option should likely not be enabled in a group by context, - as the computation is already parallelized per group. 
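A short sketch of how `rle` and `rle_id` complement each other, per the docstrings above (illustrative only; field access via the `struct` namespace is assumed to be available in the Polars versions these stubs target):

>>> import polars as pl
>>> s = pl.Series("s", [1, 1, 2, 2, 2, 1])
>>> s.rle().struct.field("lengths").to_list()  # run lengths
[2, 3, 1]
>>> s.rle_id().to_list()  # one id per run of identical values
[0, 0, 1, 1, 1, 2]

Because `rle_id` assigns a distinct id to each run, its result can serve directly as a grouping key when consecutive identical values should form groups.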
- - Returns - ------- - DataFrame - Mapping of unique values to their count. - - Examples - -------- - >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) - >>> s.value_counts() # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌───────┬────────┐ - │ color ┆ counts │ - │ --- ┆ --- │ - │ str ┆ u32 │ - ╞═══════╪════════╡ - │ red ┆ 2 │ - │ green ┆ 1 │ - │ blue ┆ 3 │ - └───────┴────────┘ - - Sort the output by count. - - shape: (3, 2) - ┌───────┬────────┐ - │ color ┆ counts │ - │ --- ┆ --- │ - │ str ┆ u32 │ - ╞═══════╪════════╡ - │ blue ┆ 3 │ - │ red ┆ 2 │ - │ green ┆ 1 │ - └───────┴────────┘ - - ''' - def unique_counts(self) -> Series: - ''' - Return a count of the unique values in the order of appearance. - - Examples - -------- - >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) - >>> s.unique_counts() - shape: (3,) - Series: \'id\' [u32] - [ - 1 - 2 - 3 - ] - - ''' - def entropy(self, base: float = ...) -> float | None: - """ - Computes the entropy. - - Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. - - Parameters - ---------- - base - Given base, defaults to `e` - normalize - Normalize pk if it doesn't sum to 1. - - Examples - -------- - >>> a = pl.Series([0.99, 0.005, 0.005]) - >>> a.entropy(normalize=True) - 0.06293300616044681 - >>> b = pl.Series([0.65, 0.10, 0.25]) - >>> b.entropy(normalize=True) - 0.8568409950394724 - - """ - def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: - ''' - Run an expression over a sliding window that increases `1` slot every iteration. - - Parameters - ---------- - expr - Expression to evaluate - min_periods - Number of valid values there should be in the window before the expression - is evaluated. valid values = `length - null_count` - parallel - Run in parallel. Don\'t do this in a group by or another operation that - already has much parallelization. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - This can be really slow as it can have `O(n^2)` complexity. Don\'t use this - for operations that visit all elements. - - Examples - -------- - >>> s = pl.Series("values", [1, 2, 3, 4, 5]) - >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) - shape: (5,) - Series: \'values\' [f64] - [ - 0.0 - -3.0 - -8.0 - -15.0 - -24.0 - ] - - ''' - def alias(self, name: str) -> Series: - ''' - Rename the series. - - Parameters - ---------- - name - The new name. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.alias("b") - shape: (3,) - Series: \'b\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def rename(self, name: str) -> Series: - ''' - Rename this Series. - - Alias for :func:`Series.alias`. - - Parameters - ---------- - name - New name. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.rename("b") - shape: (3,) - Series: \'b\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def chunk_lengths(self) -> list[int]: - ''' - Get the length of each individual chunk. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("a", [4, 5, 6]) - - Concatenate Series with rechunk = True - - >>> pl.concat([s, s2]).chunk_lengths() - [6] - - Concatenate Series with rechunk = False - - >>> pl.concat([s, s2], rechunk=False).chunk_lengths() - [3, 3] - - ''' - def n_chunks(self) -> int: - ''' - Get the number of chunks that this Series contains. 
- - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.n_chunks() - 1 - >>> s2 = pl.Series("a", [4, 5, 6]) - - Concatenate Series with rechunk = True - - >>> pl.concat([s, s2]).n_chunks() - 1 - - Concatenate Series with rechunk = False - - >>> pl.concat([s, s2], rechunk=False).n_chunks() - 2 - - ''' - def cum_max(self) -> Series: - ''' - Get an array with the cumulative max computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Examples - -------- - >>> s = pl.Series("s", [3, 5, 1]) - >>> s.cum_max() - shape: (3,) - Series: \'s\' [i64] - [ - 3 - 5 - 5 - ] - - ''' - def cum_min(self) -> Series: - ''' - Get an array with the cumulative min computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Examples - -------- - >>> s = pl.Series("s", [1, 2, 3]) - >>> s.cum_min() - shape: (3,) - Series: \'s\' [i64] - [ - 1 - 1 - 1 - ] - - ''' - def cum_prod(self) -> Series: - ''' - Get an array with the cumulative product computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.cum_prod() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 6 - ] - - ''' - def cum_sum(self) -> Series: - ''' - Get an array with the cumulative sum computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.cum_sum() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 3 - 6 - ] - - ''' - def slice(self, offset: int, length: int | None = ...) -> Series: - ''' - Get a slice of this Series. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.slice(1, 2) - shape: (2,) - Series: \'a\' [i64] - [ - 2 - 3 - ] - - ''' - def append(self, other: Series) -> Self: - ''' - Append a Series to this one. - - Parameters - ---------- - other - Series to append. - append_chunks - .. deprecated:: 0.18.8 - This argument will be removed and `append` will change to always - behave like `append_chunks=True` (the previous default). For the - behavior of `append_chunks=False`, use `Series.extend`. - - If set to `True` the append operation will add the chunks from `other` to - self. This is super cheap. - - If set to `False` the append operation will do the same as - `DataFrame.extend` which extends the memory backed by this `Series` with - the values from `other`. - - Different from `append chunks`, `extend` appends the data from `other` to - the underlying memory locations and thus may cause a reallocation (which are - expensive). - - If this does not cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `append_chunks` when you want to do a query after a - single append. For instance during online operations where you add `n` rows - and rerun a query. - - Prefer `append_chunks` over `extend` when you want to append many times - before doing a query. For instance when you read in multiple files and when - to store them in a single `Series`. 
In the latter case, finish the sequence - of `append_chunks` operations with a `rechunk`. - - Warnings - -------- - This method modifies the series in-place. The series is returned for - convenience only. - - See Also - -------- - extend - - Examples - -------- - >>> a = pl.Series("a", [1, 2, 3]) - >>> b = pl.Series("b", [4, 5]) - >>> a.append(b) - shape: (5,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ] - - The resulting series will consist of multiple chunks. - - >>> a.n_chunks() - 2 - - ''' - def extend(self, other: Series) -> Self: - ''' - Extend the memory backed by this Series with the values from another. - - Different from `append`, which adds the chunks from `other` to the chunks of - this series, `extend` appends the data from `other` to the underlying memory - locations and thus may cause a reallocation (which is expensive). - - If this does `not` cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `append` when you want to do a query after a single - append. For instance, during online operations where you add `n` rows - and rerun a query. - - Prefer `append` over `extend` when you want to append many times - before doing a query. For instance, when you read in multiple files and want - to store them in a single `Series`. In the latter case, finish the sequence - of `append` operations with a `rechunk`. - - Parameters - ---------- - other - Series to extend the series with. - - Warnings - -------- - This method modifies the series in-place. The series is returned for - convenience only. - - See Also - -------- - append - - Examples - -------- - >>> a = pl.Series("a", [1, 2, 3]) - >>> b = pl.Series("b", [4, 5]) - >>> a.extend(b) - shape: (5,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ] - - The resulting series will consist of a single chunk. - - >>> a.n_chunks() - 1 - - ''' - def filter(self, predicate: Series | list[bool]) -> Self: - ''' - Filter elements by a boolean mask. - - The original order of the remaining elements is preserved. - - Parameters - ---------- - predicate - Boolean mask. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> mask = pl.Series("", [True, False, True]) - >>> s.filter(mask) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 3 - ] - - ''' - def head(self, n: int = ...) -> Series: - ''' - Get the first `n` elements. - - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the last `abs(n)`. - - See Also - -------- - tail, slice - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.head(3) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - Pass a negative value to get all rows `except` the last `abs(n)`. - - >>> s.head(-3) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 2 - ] - - ''' - def tail(self, n: int = ...) -> Series: - ''' - Get the last `n` elements. - - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the first `abs(n)`. - - See Also - -------- - head, slice - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.tail(3) - shape: (3,) - Series: \'a\' [i64] - [ - 3 - 4 - 5 - ] - - Pass a negative value to get all rows `except` the first `abs(n)`. - - >>> s.tail(-3) - shape: (2,) - Series: \'a\' [i64] - [ - 4 - 5 - ] - - ''' - def limit(self, n: int = ...) -> Series: - """ - Get the first `n` elements. - - Alias for :func:`Series.head`. 
- - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the last `abs(n)`. - - See Also - -------- - head - - """ - def gather_every(self, n: int) -> Series: - ''' - Take every nth value in the Series and return as new Series. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.gather_every(2) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 3 - ] - - ''' - def sort(self) -> Self: - ''' - Sort this Series. - - Parameters - ---------- - descending - Sort in descending order. - in_place - Sort in-place. - - Examples - -------- - >>> s = pl.Series("a", [1, 3, 4, 2]) - >>> s.sort() - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - ] - >>> s.sort(descending=True) - shape: (4,) - Series: \'a\' [i64] - [ - 4 - 3 - 2 - 1 - ] - - ''' - def top_k(self, k: int | IntoExprColumn = ...) -> Series: - ''' - Return the `k` largest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - bottom_k - - Examples - -------- - >>> s = pl.Series("a", [2, 5, 1, 4, 3]) - >>> s.top_k(3) - shape: (3,) - Series: \'a\' [i64] - [ - 5 - 4 - 3 - ] - - ''' - def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: - ''' - Return the `k` smallest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - top_k - - Examples - -------- - >>> s = pl.Series("a", [2, 5, 1, 4, 3]) - >>> s.bottom_k(3) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def arg_sort(self) -> Series: - ''' - Get the index values that would sort this Series. - - Parameters - ---------- - descending - Sort in descending order. - nulls_last - Place null values last instead of first. - - Examples - -------- - >>> s = pl.Series("a", [5, 3, 4, 1, 2]) - >>> s.arg_sort() - shape: (5,) - Series: \'a\' [u32] - [ - 3 - 4 - 1 - 2 - 0 - ] - - ''' - def arg_unique(self) -> Series: - ''' - Get unique index as Series. - - Returns - ------- - Series - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.arg_unique() - shape: (3,) - Series: \'a\' [u32] - [ - 0 - 1 - 3 - ] - - ''' - def arg_min(self) -> int | None: - ''' - Get the index of the minimal value. - - Returns - ------- - int - - Examples - -------- - >>> s = pl.Series("a", [3, 2, 1]) - >>> s.arg_min() - 2 - - ''' - def arg_max(self) -> int | None: - ''' - Get the index of the maximal value. - - Returns - ------- - int - - Examples - -------- - >>> s = pl.Series("a", [3, 2, 1]) - >>> s.arg_max() - 0 - - ''' - def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: - """ - Find indices where elements should be inserted to maintain order. - - .. math:: a[i-1] < v <= a[i] - - Parameters - ---------- - element - Expression or scalar value. - side : {'any', 'left', 'right'} - If 'any', the index of the first suitable location found is given. - If 'left', the index of the leftmost suitable location found is given. - If 'right', return the rightmost suitable location found is given. - - """ - def unique(self) -> Series: - ''' - Get unique elements in series. - - Parameters - ---------- - maintain_order - Maintain order of data. This requires more work. 
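`search_sorted` ships without an example in this stub; a minimal sketch (illustrative only, assuming the Series is already sorted, which the method requires for meaningful results):

>>> import polars as pl
>>> s = pl.Series("a", [1, 2, 4, 7])
>>> s.search_sorted(3)  # insertion index that keeps the Series sorted
2
>>> s.search_sorted([0, 5, 9]).to_list()  # multiple lookups return a Series
[0, 3, 4]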
- - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.unique().sort() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: - ''' - Take values by index. - - Parameters - ---------- - indices - Index location used for selection. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.gather([1, 3]) - shape: (2,) - Series: \'a\' [i64] - [ - 2 - 4 - ] - - ''' - def null_count(self) -> int: - """Count the null values in this Series.""" - def has_validity(self) -> bool: - """ - Return True if the Series has a validity bitmask. - - If there is no mask, it means that there are no `null` values. - - Notes - ----- - While the *absence* of a validity bitmask guarantees that a Series does not - have `null` values, the converse is not true, eg: the *presence* of a - bitmask does not mean that there are null values, as every value of the - bitmask could be `false`. - - To confirm that a column has `null` values use :func:`null_count`. - - """ - def is_empty(self) -> bool: - ''' - Check if the Series is empty. - - Examples - -------- - >>> s = pl.Series("a", [], dtype=pl.Float32) - >>> s.is_empty() - True - - ''' - def is_sorted(self) -> bool: - """ - Check if the Series is sorted. - - Parameters - ---------- - descending - Check if the Series is sorted in descending order - - """ - def not_(self) -> Series: - ''' - Negate a boolean Series. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [True, False, False]) - >>> s.not_() - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - ''' - def is_null(self) -> Series: - ''' - Returns a boolean Series indicating which values are null. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) - >>> s.is_null() - shape: (4,) - Series: \'a\' [bool] - [ - false - false - false - true - ] - - ''' - def is_not_null(self) -> Series: - ''' - Returns a boolean Series indicating which values are not null. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) - >>> s.is_not_null() - shape: (4,) - Series: \'a\' [bool] - [ - true - true - true - false - ] - - ''' - def is_finite(self) -> Series: - ''' - Returns a boolean Series indicating which values are finite. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, np.inf]) - >>> s.is_finite() - shape: (3,) - Series: \'a\' [bool] - [ - true - true - false - ] - - ''' - def is_infinite(self) -> Series: - ''' - Returns a boolean Series indicating which values are infinite. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, np.inf]) - >>> s.is_infinite() - shape: (3,) - Series: \'a\' [bool] - [ - false - false - true - ] - - ''' - def is_nan(self) -> Series: - ''' - Returns a boolean Series indicating which values are not NaN. - - Returns - ------- - Series - Series of data type :class:`Boolean`. 
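A short sketch relating `null_count` and `has_validity` (illustrative only; as the `has_validity` docstring above notes, `null_count` is the authoritative check for nulls, since a validity bitmask can exist without any null values):

>>> import polars as pl
>>> s = pl.Series("a", [1, 2, None])
>>> s.null_count()
1
>>> s.has_validity()  # a validity bitmask is present here because of the null
True
>>> s.drop_nulls().null_count()
0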
- - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) - >>> s.is_nan() - shape: (4,) - Series: \'a\' [bool] - [ - false - false - false - true - ] - - ''' - def is_not_nan(self) -> Series: - ''' - Returns a boolean Series indicating which values are not NaN. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) - >>> s.is_not_nan() - shape: (4,) - Series: \'a\' [bool] - [ - true - true - true - false - ] - - ''' - def is_in(self, other: Series | Collection[Any]) -> Series: - ''' - Check if elements of this Series are in the other Series. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [2, 4]) - >>> s2.is_in(s) - shape: (2,) - Series: \'b\' [bool] - [ - true - false - ] - - >>> # check if some values are a member of sublists - >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) - >>> optional_members = pl.Series("optional_members", [1, 2, 3]) - >>> print(sets) - shape: (3,) - Series: \'sets\' [list[i64]] - [ - [1, 2, 3] - [1, 2] - [9, 10] - ] - >>> print(optional_members) - shape: (3,) - Series: \'optional_members\' [i64] - [ - 1 - 2 - 3 - ] - >>> optional_members.is_in(sets) - shape: (3,) - Series: \'optional_members\' [bool] - [ - true - true - false - ] - - ''' - def arg_true(self) -> Series: - ''' - Get index values where Boolean Series evaluate True. - - Returns - ------- - Series - Series of data type :class:`UInt32`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> (s == 2).arg_true() - shape: (1,) - Series: \'a\' [u32] - [ - 1 - ] - - ''' - def is_unique(self) -> Series: - ''' - Get mask of all unique values. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.is_unique() - shape: (4,) - Series: \'a\' [bool] - [ - true - false - false - true - ] - - ''' - def is_first_distinct(self) -> Series: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series([1, 1, 2, 3, 2]) - >>> s.is_first_distinct() - shape: (5,) - Series: '' [bool] - [ - true - false - true - true - false - ] - - """ - def is_last_distinct(self) -> Series: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series([1, 1, 2, 3, 2]) - >>> s.is_last_distinct() - shape: (5,) - Series: '' [bool] - [ - false - true - false - true - true - ] - - """ - def is_duplicated(self) -> Series: - ''' - Get mask of all duplicated values. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.is_duplicated() - shape: (4,) - Series: \'a\' [bool] - [ - false - true - true - false - ] - - ''' - def explode(self) -> Series: - """ - Explode a list Series. - - This means that every item is expanded to a new row. - - Returns - ------- - Series - Series with the data type of the list elements. - - See Also - -------- - Series.list.explode : Explode a list column. - Series.str.explode : Explode a string column. 
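`explode` has no example in this stub; a minimal sketch on a list Series (illustrative only):

>>> import polars as pl
>>> s = pl.Series("a", [[1, 2], [3, 4]])
>>> s.explode().to_list()  # each list item becomes its own row
[1, 2, 3, 4]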
- - """ - def equals(self, other: Series) -> bool: - ''' - Check whether the Series is equal to another Series. - - Parameters - ---------- - other - Series to compare with. - null_equal - Consider null values as equal. - strict - Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a - `pl.Int64` will return `False`. - - See Also - -------- - assert_series_equal - - Examples - -------- - >>> s1 = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [4, 5, 6]) - >>> s1.equals(s1) - True - >>> s1.equals(s2) - False - ''' - def len(self) -> int: - ''' - Return the number of elements in this Series. - - Null values are treated like regular elements in this context. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None]) - >>> s.len() - 3 - - ''' - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: - ''' - Cast between data types. - - Parameters - ---------- - dtype - DataType to cast to. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> s = pl.Series("a", [True, False, True]) - >>> s - shape: (3,) - Series: \'a\' [bool] - [ - true - false - true - ] - - >>> s.cast(pl.UInt32) - shape: (3,) - Series: \'a\' [u32] - [ - 1 - 0 - 1 - ] - - ''' - def to_physical(self) -> Series: - ''' - Cast to physical representation of the logical dtype. - - - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` - - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` - - `List(inner)` -> `List(physical of inner)` - - Other data types will be left unchanged. - - Examples - -------- - Replicating the pandas - `pd.Series.factorize - `_ - method. - - >>> s = pl.Series("values", ["a", None, "x", "a"]) - >>> s.cast(pl.Categorical).to_physical() - shape: (4,) - Series: \'values\' [u32] - [ - 0 - null - 1 - 0 - ] - - ''' - def to_list(self) -> list[Any]: - ''' - Convert this Series to a Python List. This operation clones data. - - Parameters - ---------- - use_pyarrow - Use pyarrow for the conversion. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.to_list() - [1, 2, 3] - >>> type(s.to_list()) - - - ''' - def rechunk(self) -> Self: - """ - Create a single chunk of memory for this Series. - - Parameters - ---------- - in_place - In place or not. - - """ - def reverse(self) -> Series: - ''' - Return Series in reverse order. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) - >>> s.reverse() - shape: (3,) - Series: \'a\' [i8] - [ - 3 - 2 - 1 - ] - - ''' - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: - ''' - Get a boolean mask of the values that fall between the given start/end values. - - Parameters - ---------- - lower_bound - Lower bound value. Accepts expression input. Non-expression inputs - (including strings) are parsed as literals. - upper_bound - Upper bound value. Accepts expression input. Non-expression inputs - (including strings) are parsed as literals. - closed : {\'both\', \'left\', \'right\', \'none\'} - Define which sides of the interval are closed (inclusive). 
- - Examples - -------- - >>> s = pl.Series("num", [1, 2, 3, 4, 5]) - >>> s.is_between(2, 4) - shape: (5,) - Series: \'num\' [bool] - [ - false - true - true - true - false - ] - - Use the `closed` argument to include or exclude the values at the bounds: - - >>> s.is_between(2, 4, closed="left") - shape: (5,) - Series: \'num\' [bool] - [ - false - true - true - false - false - ] - - You can also use strings as well as numeric/temporal values: - - >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) - >>> s.is_between("b", "d", closed="both") - shape: (5,) - Series: \'s\' [bool] - [ - false - true - true - true - false - ] - - ''' - def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: - ''' - Convert this Series to numpy. - - This operation may clone data but is completely safe. Note that: - - - data which is purely numeric AND without null values is not cloned; - - floating point `nan` values can be zero-copied; - - booleans can\'t be zero-copied. - - To ensure that no data is cloned, set `zero_copy_only=True`. - - Parameters - ---------- - *args - args will be sent to pyarrow.Array.to_numpy. - zero_copy_only - If True, an exception will be raised if the conversion to a numpy - array would require copying the underlying data (e.g. in presence - of nulls, or for non-primitive types). - writable - For numpy arrays created with zero copy (view on the Arrow data), - the resulting array is not writable (Arrow data is immutable). - By setting this to True, a copy of the array is made to ensure - it is writable. - use_pyarrow - Use `pyarrow.Array.to_numpy - `_ - - for the conversion to numpy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> arr = s.to_numpy() - >>> arr # doctest: +IGNORE_RESULT - array([1, 2, 3], dtype=int64) - >>> type(arr) - - - ''' - def _view(self) -> SeriesView: - ''' - Get a view into this Series data with a numpy array. - - This operation doesn\'t clone data, but does not include missing values. - - Returns - ------- - SeriesView - - Parameters - ---------- - ignore_nulls - If True then nulls are converted to 0. - If False then an Exception is raised if nulls are present. - - Examples - -------- - >>> s = pl.Series("a", [1, None]) - >>> s._view(ignore_nulls=True) - SeriesView([1, 0]) - - ''' - def to_arrow(self) -> pa.Array: - ''' - Get the underlying Arrow Array. - - If the Series contains only a single chunk this operation is zero copy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s = s.to_arrow() - >>> s # doctest: +ELLIPSIS - - [ - 1, - 2, - 3 - ] - - ''' - def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: - ''' - Convert this Series to a pandas Series. - - This requires that :mod:`pandas` and :mod:`pyarrow` are installed. - This operation clones data, unless `use_pyarrow_extension_array=True`. - - Parameters - ---------- - use_pyarrow_extension_array - Further operations on this Pandas series, might trigger conversion to numpy. - Use PyArrow backed-extension array instead of numpy array for pandas - Series. This allows zero copy operations and preservation of nulls - values. - Further operations on this pandas Series, might trigger conversion - to NumPy arrays if that operation is not supported by pyarrow compute - functions. - kwargs - Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
- - Examples - -------- - >>> s1 = pl.Series("a", [1, 2, 3]) - >>> s1.to_pandas() - 0 1 - 1 2 - 2 3 - Name: a, dtype: int64 - >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP - 0 1 - 1 2 - 2 3 - Name: a, dtype: int64[pyarrow] - >>> s2 = pl.Series("b", [1, 2, None, 4]) - >>> s2.to_pandas() - 0 1.0 - 1 2.0 - 2 NaN - 3 4.0 - Name: b, dtype: float64 - >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP - 0 1 - 1 2 - 2 - 3 4 - Name: b, dtype: int64[pyarrow] - - ''' - def to_init_repr(self, n: int = ...) -> str: - ''' - Convert Series to instantiatable string representation. - - Parameters - ---------- - n - Only use first n elements. - - See Also - -------- - polars.Series.to_init_repr - polars.from_repr - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) - >>> print(s.to_init_repr()) - pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) - >>> s_from_str_repr = eval(s.to_init_repr()) - >>> s_from_str_repr - shape: (4,) - Series: \'a\' [i16] - [ - 1 - 2 - null - 4 - ] - - ''' - def set(self, filter: Series, value: int | float | str | bool | None) -> Series: - ''' - Set masked values. - - Parameters - ---------- - filter - Boolean mask. - value - Value with which to replace the masked values. - - Notes - ----- - Use of this function is frequently an anti-pattern, as it can - block optimisation (predicate pushdown, etc). Consider using - `pl.when(predicate).then(value).otherwise(self)` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.set(s == 2, 10) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 10 - 3 - ] - - It is better to implement this as follows: - - >>> s.to_frame().select( - ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) - ... ) - shape: (3, 1) - ┌─────────┐ - │ literal │ - │ --- │ - │ i64 │ - ╞═════════╡ - │ 1 │ - │ 10 │ - │ 3 │ - └─────────┘ - - ''' - def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: - ''' - Set values at the index locations. - - Parameters - ---------- - indices - Integers representing the index locations. - values - Replacement values. - - Notes - ----- - Use of this function is frequently an anti-pattern, as it can - block optimization (predicate pushdown, etc). Consider using - `pl.when(predicate).then(value).otherwise(self)` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.scatter(1, 10) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 10 - 3 - ] - - It is better to implement this as follows: - - >>> s.to_frame().with_row_count("row_nr").select( - ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) - ... ) - shape: (3, 1) - ┌─────────┐ - │ literal │ - │ --- │ - │ i64 │ - ╞═════════╡ - │ 1 │ - │ 10 │ - │ 3 │ - └─────────┘ - - ''' - def clear(self, n: int = ...) -> Series: - ''' - Create an empty copy of the current Series, with zero to \'n\' elements. - - The copy has an identical name/dtype, but no data. - - Parameters - ---------- - n - Number of (empty) elements to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. 
- - Examples - -------- - >>> s = pl.Series("a", [None, True, False]) - >>> s.clear() - shape: (0,) - Series: \'a\' [bool] - [ - ] - - >>> s.clear(n=2) - shape: (2,) - Series: \'a\' [bool] - [ - null - null - ] - - ''' - def clone(self) -> Self: - ''' - Create a copy of this Series. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current Series, with identical - schema but no data. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.clone() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Series: - ''' - Fill floating point NaN value with a fill value. - - Parameters - ---------- - value - Value used to fill NaN values. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, float("nan")]) - >>> s.fill_nan(0) - shape: (4,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - 0.0 - ] - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, None]) - >>> s.fill_null(strategy="forward") - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 3 - ] - >>> s.fill_null(strategy="min") - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 1 - ] - >>> s = pl.Series("b", ["x", None, "z"]) - >>> s.fill_null(pl.lit("")) - shape: (3,) - Series: \'b\' [str] - [ - "x" - "" - "z" - ] - - ''' - def floor(self) -> Series: - ''' - Rounds down to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.floor() - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - ] - - ''' - def ceil(self) -> Series: - ''' - Rounds up to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.ceil() - shape: (3,) - Series: \'a\' [f64] - [ - 2.0 - 3.0 - 4.0 - ] - - ''' - def round(self, decimals: int = ...) -> Series: - ''' - Round underlying floating point data by `decimals` digits. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.round(2) - shape: (3,) - Series: \'a\' [f64] - [ - 1.12 - 2.57 - 3.9 - ] - - Parameters - ---------- - decimals - number of decimals to round by. - - ''' - def round_sig_figs(self, digits: int) -> Series: - """ - Round to a number of significant figures. - - Parameters - ---------- - digits - Number of significant figures to round to. - - Examples - -------- - >>> s = pl.Series([0.01234, 3.333, 1234.0]) - >>> s.round_sig_figs(2) - shape: (3,) - Series: '' [f64] - [ - 0.012 - 3.3 - 1200.0 - ] - - """ - def dot(self, other: Series | ArrayLike) -> float | None: - ''' - Compute the dot/inner product between two Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) - >>> s.dot(s2) - 32.0 - - Parameters - ---------- - other - Series (or array) to compute dot product with. - - ''' - def mode(self) -> Series: - ''' - Compute the most occurring value(s). 
- - Can return multiple Values. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.mode() - shape: (1,) - Series: \'a\' [i64] - [ - 2 - ] - - ''' - def sign(self) -> Series: - ''' - Compute the element-wise indication of the sign. - - The returned values can be -1, 0, or 1: - - * -1 if x < 0. - * 0 if x == 0. - * 1 if x > 0. - - (null values are preserved as-is). - - Examples - -------- - >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) - >>> s.sign() - shape: (5,) - Series: \'a\' [i64] - [ - -1 - 0 - 0 - 1 - null - ] - - ''' - def sin(self) -> Series: - ''' - Compute the element-wise value for the sine. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.sin() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.0 - 1.2246e-16 - ] - - ''' - def cos(self) -> Series: - ''' - Compute the element-wise value for the cosine. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.cos() - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 6.1232e-17 - -1.0 - ] - - ''' - def tan(self) -> Series: - ''' - Compute the element-wise value for the tangent. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.tan() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.6331e16 - -1.2246e-16 - ] - - ''' - def cot(self) -> Series: - ''' - Compute the element-wise value for the cotangent. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.cot() - shape: (3,) - Series: \'a\' [f64] - [ - inf - 6.1232e-17 - -8.1656e15 - ] - - ''' - def arcsin(self) -> Series: - ''' - Compute the element-wise value for the inverse sine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arcsin() - shape: (3,) - Series: \'a\' [f64] - [ - 1.570796 - 0.0 - -1.570796 - ] - - ''' - def arccos(self) -> Series: - ''' - Compute the element-wise value for the inverse cosine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arccos() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.570796 - 3.141593 - ] - - ''' - def arctan(self) -> Series: - ''' - Compute the element-wise value for the inverse tangent. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arctan() - shape: (3,) - Series: \'a\' [f64] - [ - 0.785398 - 0.0 - -0.785398 - ] - - ''' - def arcsinh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic sine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arcsinh() - shape: (3,) - Series: \'a\' [f64] - [ - 0.881374 - 0.0 - -0.881374 - ] - - ''' - def arccosh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic cosine. - - Examples - -------- - >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) - >>> s.arccosh() - shape: (4,) - Series: \'a\' [f64] - [ - 2.292432 - 0.0 - NaN - NaN - ] - - ''' - def arctanh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic tangent. - - Examples - -------- - >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) - >>> s.arctanh() - shape: (7,) - Series: \'a\' [f64] - [ - NaN - inf - 0.549306 - 0.0 - -0.549306 - -inf - NaN - ] - - ''' - def sinh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic sine. 
- - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.sinh() - shape: (3,) - Series: \'a\' [f64] - [ - 1.175201 - 0.0 - -1.175201 - ] - - ''' - def cosh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic cosine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.cosh() - shape: (3,) - Series: \'a\' [f64] - [ - 1.543081 - 1.0 - 1.543081 - ] - - ''' - def tanh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic tangent. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.tanh() - shape: (3,) - Series: \'a\' [f64] - [ - 0.761594 - 0.0 - -0.761594 - ] - - ''' - def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Map a custom/user-defined function (UDF) over elements in this Series. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - If the function returns a different datatype, the return_dtype arg should - be set, otherwise the method will fail. - - Implementing logic using a Python function is almost always *significantly* - slower and more memory intensive than implementing the same logic using - the native expression API because: - - - The native expression engine runs in Rust; UDFs run in Python. - - Use of Python UDFs forces the DataFrame to be materialized in memory. - - Polars-native expressions can be parallelised (UDFs typically cannot). - - Polars-native expressions can be logically optimised (UDFs cannot). - - Wherever possible you should strongly prefer the native expression API - to achieve the best performance. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output datatype. If none is given, the same datatype as this Series will be - used. - skip_nulls - Nulls will be skipped and not passed to the python function. - This is faster because python can be skipped and because we call - more specialized functions. - - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - Notes - ----- - If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP - shape: (3,) - Series: \'a\' [i64] - [ - 11 - 12 - 13 - ] - - Returns - ------- - Series - - ''' - def shift(self, n: int = ...) -> Series: - """ - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> s = pl.Series([1, 2, 3, 4]) - >>> s.shift() - shape: (4,) - Series: '' [i64] - [ - null - 1 - 2 - 3 - ] - - Pass a negative value to shift in the opposite direction instead. 
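The `map_elements` docstring above stresses passing `return_dtype` whenever the UDF changes the data type; a minimal sketch of that pattern (illustrative only, mirroring the `# doctest: +SKIP` convention the stub itself uses for UDF examples):

>>> import polars as pl
>>> s = pl.Series("a", [1, 2, 3])
>>> s.map_elements(lambda x: f"#{x}", return_dtype=pl.Utf8).to_list()  # doctest: +SKIP
['#1', '#2', '#3']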
- - >>> s.shift(-2) - shape: (4,) - Series: '' [i64] - [ - 3 - 4 - null - null - ] - - Specify `fill_value` to fill the resulting null values. - - >>> s.shift(-2, fill_value=100) - shape: (4,) - Series: '' [i64] - [ - 3 - 4 - 100 - 100 - ] - - """ - def zip_with(self, mask: Series, other: Series) -> Self: - """ - Take values from self or other based on the given mask. - - Where mask evaluates true, take values from self. Where mask evaluates false, - take values from other. - - Parameters - ---------- - mask - Boolean Series. - other - Series of same type. - - Returns - ------- - Series - - Examples - -------- - >>> s1 = pl.Series([1, 2, 3, 4, 5]) - >>> s2 = pl.Series([5, 4, 3, 2, 1]) - >>> s1.zip_with(s1 < s2, s2) - shape: (5,) - Series: '' [i64] - [ - 1 - 2 - 3 - 2 - 1 - ] - >>> mask = pl.Series([True, False, True, False, True]) - >>> s1.zip_with(mask, s2) - shape: (5,) - Series: '' [i64] - [ - 1 - 4 - 3 - 2 - 5 - ] - - """ - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling min (moving min) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their min. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_min(window_size=3) - shape: (5,) - Series: \'a\' [i64] - [ - null - null - 100 - 200 - 300 - ] - - ''' - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling max (moving max) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their max. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_max(window_size=2) - shape: (5,) - Series: \'a\' [i64] - [ - null - 200 - 300 - 400 - 500 - ] - - ''' - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling mean (moving mean) over the values in this array. - - A window of length `window_size` will traverse the array. 
The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their mean. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_mean(window_size=2) - shape: (5,) - Series: \'a\' [f64] - [ - null - 150.0 - 250.0 - 350.0 - 450.0 - ] - - ''' - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling sum (moving sum) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their sum. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length of the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.rolling_sum(window_size=2) - shape: (5,) - Series: \'a\' [i64] - [ - null - 3 - 5 - 7 - 9 - ] - - ''' - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling std dev. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their std dev. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_std(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 1.0 - 1.527525 - 2.0 - ] - - ''' - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling variance. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. 
The resulting values will be aggregated to their variance. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_var(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 1.0 - 2.333333 - 4.0 - ] - - ''' - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a custom rolling window function. - - .. warning:: - Computing custom functions is extremely slow. Use specialized rolling - functions such as :func:`Series.rolling_sum` if at all possible. - - Parameters - ---------- - function - Custom aggregation function. - window_size - Size of the window. The window at a given row will include the row - itself and the `window_size - 1` elements before it. - weights - A list of weights with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window. - - Warnings - -------- - - - Examples - -------- - >>> from numpy import nansum - >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) - >>> s.rolling_map(nansum, window_size=3) - shape: (5,) - Series: \'\' [f64] - [ - null - null - 22.0 - 11.0 - 17.0 - ] - - ''' - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling median. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_median(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 2.0 - 3.0 - 4.0 - 6.0 - ] - - ''' - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling quantile. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - window_size - The length of the window. 
- weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_quantile(quantile=0.33, window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 2.0 - 3.0 - 4.0 - ] - >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.66 - 2.66 - 3.66 - 5.32 - ] - - ''' - def rolling_skew(self, window_size: int) -> Series: - """ - Compute a rolling skew. - - The window at a given row includes the row itself and the - `window_size - 1` elements before it. - - Parameters - ---------- - window_size - Integer size of the rolling window. - bias - If False, the calculations are corrected for statistical bias. - - Examples - -------- - >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) - shape: (4,) - Series: '' [f64] - [ - null - null - 0.381802 - 0.47033 - ] - - Note how the values match - - >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() - (0.38180177416060584, 0.47033046033698594) - - """ - def sample(self, n: int | None = ...) -> Series: - ''' - Sample from this Series. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - Shuffle the order of sampled data points. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 5 - ] - - ''' - def peak_max(self) -> Self: - ''' - Get a boolean mask of the local maximum peaks. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.peak_max() - shape: (5,) - Series: \'a\' [bool] - [ - false - false - false - false - true - ] - - ''' - def peak_min(self) -> Self: - ''' - Get a boolean mask of the local minimum peaks. - - Examples - -------- - >>> s = pl.Series("a", [4, 1, 3, 2, 5]) - >>> s.peak_min() - shape: (5,) - Series: \'a\' [bool] - [ - false - true - false - true - false - ] - - ''' - def n_unique(self) -> int: - ''' - Count the number of unique values in this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.n_unique() - 3 - - ''' - def shrink_to_fit(self) -> Series: - """ - Shrink Series memory usage. - - Shrinks the underlying array capacity to exactly fit the actual data. - (Note that this function does not change the Series data type). - - """ - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: - ''' - Hash the Series. - - The hash value is of type `UInt64`. - - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. 
- - Notes - ----- - This implementation of :func:`hash` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.hash(seed=42) # doctest: +IGNORE_RESULT - shape: (3,) - Series: \'a\' [u64] - [ - 10734580197236529959 - 3022416320763508302 - 13756996518000038261 - ] - - ''' - def reinterpret(self) -> Series: - """ - Reinterpret the underlying bits as a signed/unsigned integer. - - This operation is only allowed for 64bit integers. For lower bits integers, - you can safely use that cast operation. - - Parameters - ---------- - signed - If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. - - """ - def interpolate(self, method: InterpolationMethod = ...) -> Series: - ''' - Fill null values using interpolation. - - Parameters - ---------- - method : {\'linear\', \'nearest\'} - Interpolation method. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None, None, 5]) - >>> s.interpolate() - shape: (5,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - 4.0 - 5.0 - ] - - ''' - def abs(self) -> Series: - """ - Compute absolute values. - - Same as `abs(series)`. - """ - def rank(self, method: RankMethod = ...) -> Series: - ''' - Assign ranks to data, dealing with ties appropriately. - - Parameters - ---------- - method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} - The method used to assign ranks to tied elements. - The following methods are available (default is \'average\'): - - - \'average\' : The average of the ranks that would have been assigned to - all the tied values is assigned to each value. - - \'min\' : The minimum of the ranks that would have been assigned to all - the tied values is assigned to each value. (This is also referred to - as "competition" ranking.) - - \'max\' : The maximum of the ranks that would have been assigned to all - the tied values is assigned to each value. - - \'dense\' : Like \'min\', but the rank of the next highest element is - assigned the rank immediately after those assigned to the tied - elements. - - \'ordinal\' : All values are given a distinct rank, corresponding to - the order that the values occur in the Series. - - \'random\' : Like \'ordinal\', but the rank for ties is not dependent - on the order that the values occur in the Series. - descending - Rank in descending order. - seed - If `method="random"`, use this as seed. - - Examples - -------- - The \'average\' method: - - >>> s = pl.Series("a", [3, 6, 1, 1, 6]) - >>> s.rank() - shape: (5,) - Series: \'a\' [f64] - [ - 3.0 - 4.5 - 1.5 - 1.5 - 4.5 - ] - - The \'ordinal\' method: - - >>> s = pl.Series("a", [3, 6, 1, 1, 6]) - >>> s.rank("ordinal") - shape: (5,) - Series: \'a\' [u32] - [ - 3 - 4 - 1 - 2 - 5 - ] - - ''' - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: - ''' - Calculate the first discrete difference between shifted items. - - Parameters - ---------- - n - Number of slots to shift. - null_behavior : {\'ignore\', \'drop\'} - How to handle null values. - - Examples - -------- - >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) - >>> s.diff() - shape: (5,) - Series: \'s\' [i8] - [ - null - -10 - 20 - -5 - 10 - ] - - >>> s.diff(n=2) - shape: (5,) - Series: \'s\' [i8] - [ - null - null - 10 - 15 - 5 - ] - - >>> s.diff(n=2, null_behavior="drop") - shape: (3,) - Series: \'s\' [i8] - [ - 10 - 15 - 5 - ] - - ''' - def pct_change(self, n: int | IntoExprColumn = ...) 
-> Series: - """ - Computes percentage change between values. - - Percentage change (as fraction) between current element and most-recent - non-null element at least `n` period(s) before the current element. - - Computes the change from the previous row by default. - - Parameters - ---------- - n - periods to shift for forming percent change. - - Examples - -------- - >>> pl.Series(range(10)).pct_change() - shape: (10,) - Series: '' [f64] - [ - null - inf - 1.0 - 0.5 - 0.333333 - 0.25 - 0.2 - 0.166667 - 0.142857 - 0.125 - ] - - >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) - shape: (10,) - Series: '' [f64] - [ - null - null - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - ] - - """ - def skew(self) -> float | None: - """ - Compute the sample skewness of a data set. - - For normally distributed data, the skewness should be about zero. For - unimodal continuous distributions, a skewness value greater than zero means - that there is more weight in the right tail of the distribution. The - function `skewtest` can be used to determine if the skewness value - is close enough to zero, statistically speaking. - - - See scipy.stats for more information. - - Parameters - ---------- - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Notes - ----- - The sample skewness is computed as the Fisher-Pearson coefficient - of skewness, i.e. - - .. math:: g_1=\\frac{m_3}{m_2^{3/2}} - - where - - .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i - - is the biased sample :math:`i\\texttt{th}` central moment, and - :math:`\\bar{x}` is - the sample mean. If `bias` is False, the calculations are - corrected for bias and the value computed is the adjusted - Fisher-Pearson standardized moment coefficient, i.e. - - .. math:: - G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} - - """ - def kurtosis(self) -> float | None: - """ - Compute the kurtosis (Fisher or Pearson) of a dataset. - - Kurtosis is the fourth central moment divided by the square of the - variance. If Fisher's definition is used, then 3.0 is subtracted from - the result to give 0.0 for a normal distribution. - If bias is False then the kurtosis is calculated using k statistics to - eliminate bias coming from biased moment estimators - - See scipy.stats for more information - - Parameters - ---------- - fisher : bool, optional - If True, Fisher's definition is used (normal ==> 0.0). If False, - Pearson's definition is used (normal ==> 3.0). - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - """ - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: - """ - Set values outside the given boundaries to the boundary value. - - Parameters - ---------- - lower_bound - Lower bound. Accepts expression input. - Non-expression inputs are parsed as literals. - If set to `None` (default), no lower bound is applied. - upper_bound - Upper bound. Accepts expression input. - Non-expression inputs are parsed as literals. - If set to `None` (default), no upper bound is applied. - - See Also - -------- - when - - Notes - ----- - This method only works for numeric and temporal columns. To clip other data - types, consider writing a `when-then-otherwise` expression. See :func:`when`. 
- - Examples - -------- - Specifying both a lower and upper bound: - - >>> s = pl.Series([-50, 5, 50, None]) - >>> s.clip(1, 10) - shape: (4,) - Series: '' [i64] - [ - 1 - 5 - 10 - null - ] - - Specifying only a single bound: - - >>> s.clip(upper_bound=10) - shape: (4,) - Series: '' [i64] - [ - -50 - 5 - 10 - null - ] - - """ - def lower_bound(self) -> Self: - ''' - Return the lower bound of this Series\' dtype as a unit Series. - - See Also - -------- - upper_bound : return the upper bound of the given Series\' dtype. - - Examples - -------- - >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) - >>> s.lower_bound() - shape: (1,) - Series: \'s\' [i32] - [ - -2147483648 - ] - - >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) - >>> s.lower_bound() - shape: (1,) - Series: \'s\' [f32] - [ - -inf - ] - - ''' - def upper_bound(self) -> Self: - ''' - Return the upper bound of this Series\' dtype as a unit Series. - - See Also - -------- - lower_bound : return the lower bound of the given Series\' dtype. - - Examples - -------- - >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) - >>> s.upper_bound() - shape: (1,) - Series: \'s\' [i8] - [ - 127 - ] - - >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) - >>> s.upper_bound() - shape: (1,) - Series: \'s\' [f64] - [ - inf - ] - - ''' - def replace(self, mapping: dict[Any, Any]) -> Self: - ''' - Replace values according to the given mapping. - - Needs a global string cache for lazily evaluated queries on columns of - type `Categorical`. - - Parameters - ---------- - mapping - Mapping of values to their replacement. - default - Value to use when the mapping does not contain the lookup value. - Defaults to keeping the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - - See Also - -------- - str.replace - - Examples - -------- - Replace a single value by another value. Values not in the mapping remain - unchanged. - - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.replace({2: 100}) - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 100 - 100 - 3 - ] - - Replace multiple values. Specify a default to set values not in the given map - to the default value. - - >>> s = pl.Series("country_code", ["FR", "ES", "DE", None]) - >>> country_code_map = { - ... "CA": "Canada", - ... "DE": "Germany", - ... "FR": "France", - ... None: "unspecified", - ... } - >>> s.replace(country_code_map, default=None) - shape: (4,) - Series: \'country_code\' [str] - [ - "France" - null - "Germany" - "unspecified" - ] - - The return type can be overridden with the `return_dtype` argument. - - >>> s = pl.Series("a", [0, 1, 2, 3]) - >>> s.replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) - shape: (4,) - Series: \'a\' [u8] - [ - 0 - 10 - 20 - 0 - ] - ''' - def reshape(self, dimensions: tuple[int, ...]) -> Series: - ''' - Reshape this Series to a flat Series or a Series of Lists. - - Parameters - ---------- - dimensions - Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that - dimension is inferred. - - Returns - ------- - Series - If a single dimension is given, results in a Series of the original - data type. - If a multiple dimensions are given, results in a Series of data type - :class:`List` with shape (rows, cols). - - See Also - -------- - Series.list.explode : Explode a list column. 
- - Examples - -------- - >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) - >>> s.reshape((3, 3)) - shape: (3,) - Series: \'foo\' [list[i64]] - [ - [1, 2, 3] - [4, 5, 6] - [7, 8, 9] - ] - - ''' - def shuffle(self, seed: int | None = ...) -> Series: - ''' - Shuffle the contents of this Series. - - Parameters - ---------- - seed - Seed for the random number generator. If set to None (default), a - random seed is generated each time the shuffle is called. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.shuffle(seed=1) - shape: (3,) - Series: \'a\' [i64] - [ - 2 - 1 - 3 - ] - - ''' - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - """ - Exponentially-weighted moving average. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series([1, 2, 3]) - >>> s.ewm_mean(com=1) - shape: (3,) - Series: '' [f64] - [ - 1.0 - 1.666667 - 2.428571 - ] - - """ - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - ''' - Exponentially-weighted moving standard deviation. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. 
math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.ewm_std(com=1) - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 0.707107 - 0.963624 - ] - - ''' - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - ''' - Exponentially-weighted moving variance. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. 
For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.ewm_var(com=1) - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 0.5 - 0.928571 - ] - - ''' - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: - """ - Extremely fast method for extending the Series with 'n' copies of a value. - - Parameters - ---------- - value - A constant literal value (not an expression) with which to extend - the Series; can pass None to extend with nulls. - n - The number of additional values that will be added. - - Examples - -------- - >>> s = pl.Series([1, 2, 3]) - >>> s.extend_constant(99, n=2) - shape: (5,) - Series: '' [i64] - [ - 1 - 2 - 3 - 99 - 99 - ] - - """ - def set_sorted(self) -> Self: - ''' - Flags the Series as \'sorted\'. - - Enables downstream code to user fast paths for sorted arrays. - - Parameters - ---------- - descending - If the `Series` order is descending. - - Warnings - -------- - This can lead to incorrect results if this `Series` is not sorted!! - Use with care! - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.set_sorted().max() - 3 - - ''' - def new_from_index(self, index: int, length: int) -> Self: - """Create a new Series filled with values from the given index.""" - def shrink_dtype(self) -> Series: - """ - Shrink numeric columns to the minimal required datatype. - - Shrink to the dtype needed to fit the extrema of this [`Series`]. - This can be used to reduce memory pressure. - """ - def get_chunks(self) -> list[Series]: - """Get the chunks of this Series as a list of Series.""" - def implode(self) -> Self: - """Aggregate values into a list.""" - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom/user-defined function (UDF) over elements in this Series. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Series.map_elements`. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output datatype. If none is given, the same datatype as this Series will be - used. - skip_nulls - Nulls will be skipped and not passed to the python function. - This is faster because python can be skipped and because we call - more specialized functions. - - """ - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - """ - Apply a custom rolling window function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Series.rolling_map`. - - Parameters - ---------- - function - Aggregation function - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - """ - def is_first(self) -> Series: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Series.is_first_distinct`. - - Returns - ------- - Series - Series of data type :class:`Boolean`. 
- - """ - def is_last(self) -> Series: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Series.is_last_distinct`. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - """ - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: - """ - Clip (limit) the values in an array to a `min` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - lower_bound - Lower bound. - - """ - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: - """ - Clip (limit) the values in an array to a `max` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - upper_bound - Upper bound. - - """ - def shift_and_fill(self, fill_value: int | Expr) -> Series: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - Fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def is_float(self) -> bool: - ''' - Check if this Series has floating point numbers. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_float()` instead. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0]) - >>> s.is_float() # doctest: +SKIP - True - - ''' - def is_integer(self, signed: bool | None = ...) -> bool: - ''' - Check if this Series datatype is an integer (signed or unsigned). - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_integer()` instead. - For signed/unsigned variants, use `Series.dtype.is_signed_integer()` - or `Series.dtype.is_unsigned_integer()`. - - Parameters - ---------- - signed - * if `None`, both signed and unsigned integer dtypes will match. - * if `True`, only signed integer dtypes will be considered a match. - * if `False`, only unsigned integer dtypes will be considered a match. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) - >>> s.is_integer() # doctest: +SKIP - True - >>> s.is_integer(signed=False) # doctest: +SKIP - True - >>> s.is_integer(signed=True) # doctest: +SKIP - False - - ''' - def is_numeric(self) -> bool: - ''' - Check if this Series datatype is numeric. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_float()` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.is_numeric() # doctest: +SKIP - True - - ''' - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: - """ - Check if this Series datatype is temporal. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_temporal()` instead. - - Parameters - ---------- - excluding - Optionally exclude one or more temporal dtypes from matching. - - Examples - -------- - >>> from datetime import date - >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) - >>> s.is_temporal() # doctest: +SKIP - True - >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP - False - - """ - def is_boolean(self) -> bool: - ''' - Check if this Series is a Boolean. - - .. deprecated:: 0.19.14 - Use `Series.dtype == pl.Boolean` instead. - - Examples - -------- - >>> s = pl.Series("a", [True, False, True]) - >>> s.is_boolean() # doctest: +SKIP - True - - ''' - def is_utf8(self) -> bool: - ''' - Check if this Series datatype is a Utf8. - - .. deprecated:: 0.19.14 - Use `Series.dtype == pl.Utf8` instead. 
- - Examples - -------- - >>> s = pl.Series("x", ["a", "b", "c"]) - >>> s.is_utf8() # doctest: +SKIP - True - - ''' - def take_every(self, n: int) -> Series: - """ - Take every nth value in the Series and return as new Series. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: - """ - Take values by index. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather`. - - Parameters - ---------- - indices - Index location used for selection. - """ - def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: - """ - Set values at the index locations. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`scatter`. - - Parameters - ---------- - indices - Integers representing the index locations. - values - Replacement values. - """ - def cumsum(self) -> Series: - """ - Get an array with the cumulative sum computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_sum`. - - Parameters - ---------- - reverse - reverse the operation. - - """ - def cummax(self) -> Series: - """ - Get an array with the cumulative max computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_max`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def cummin(self) -> Series: - """ - Get an array with the cumulative min computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_min`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def cumprod(self) -> Series: - """ - Get an array with the cumulative product computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_prod`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def view(self) -> SeriesView: - """ - Get a view into this Series data with a numpy array. - - .. deprecated:: 0.19.14 - This method will be removed in a future version. - - This operation doesn't clone data, but does not include missing values. - Don't use this unless you know what you are doing. - - Parameters - ---------- - ignore_nulls - If True then nulls are converted to 0. - If False then an Exception is raised if nulls are present. - - """ - def map_dict(self, mapping: dict[Any, Any]) -> Self: - """ - Replace values in the Series using a remapping dictionary. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`replace`. The default behavior - has changed to keep any values not present in the mapping unchanged. - Pass `default=None` to keep existing behavior. - - Parameters - ---------- - mapping - Dictionary containing the before/after values to map. - default - Value to use when the remapping dict does not contain the lookup value. - Use `pl.first()`, to keep the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - """ - def series_equal(self, other: Series) -> bool: - """ - Check whether the Series is equal to another Series. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`equals`. - - Parameters - ---------- - other - Series to compare with. 
- null_equal - Consider null values as equal. - strict - Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a - `pl.Int64` will return `False`. - """ - @property - def dtype(self): ... - @property - def flags(self): ... - @property - def inner_dtype(self): ... - @property - def name(self): ... - @property - def shape(self): ... - @property - def bin(self): ... - @property - def cat(self): ... - @property - def dt(self): ... - @property - def list(self): ... - @property - def arr(self): ... - @property - def str(self): ... - @property - def struct(self): ... -def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: - """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/series/series.pyi similarity index 99% rename from polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/series/series rename to polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/series/series.pyi index 4a40006..c28f651 100644 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/series/series +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.18/polars/series/series.pyi @@ -1,3 +1,4 @@ +#: version 0.19.18 import np as np import pa as pa import pd as pd @@ -4761,7 +4762,7 @@ class Series: Check if this Series datatype is numeric. .. deprecated:: 0.19.13 - Use `Series.dtype.is_float()` instead. + Use `Series.dtype.is_numeric()` instead. Examples -------- diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/dataframe/frame deleted file mode 100644 index 562effd..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/dataframe/frame +++ /dev/null @@ -1,6977 +0,0 @@ -import P -import deltalake -import np as np -import pa as pa -import pd as pd -from _io import BytesIO, TextIOWrapper - -from builtins import PyDataFrame -from pathlib import Path -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes.classes import Boolean as Boolean, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 -from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.col import col as col -from polars.functions.lit import lit as lit -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, 
_xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence - -TYPE_CHECKING: bool -INTEGER_DTYPES: frozenset -N_INFER_DEFAULT: int -_PYARROW_AVAILABLE: bool -_dtype_str_repr: builtin_function_or_method - -class DataFrame: - _accessors: _ClassVar[set] = ... - columns: Incomplete - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: - """Construct Polars DataFrame from FFI PyDataFrame object.""" - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from a dictionary of sequences. - - Parameters - ---------- - data : dict of sequences - Two-dimensional data represented as a dictionary. dict must contain - Sequences. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. 
- schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - - """ - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from a sequence of sequences. - - Parameters - ---------- - data : Sequence of sequences - Two-dimensional data represented as a sequence of sequences. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - orient : {'col', 'row'}, default None - Whether to interpret two-dimensional data as columns or as rows. If None, - the orientation is inferred by matching the columns and data dimensions. If - this does not yield conclusive results, column orientation is used. - infer_schema_length - How many rows to scan to determine the column type. - - """ - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from a numpy ndarray. - - Parameters - ---------- - data : numpy ndarray - Two-dimensional data represented as a numpy ndarray. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - orient : {'col', 'row'}, default None - Whether to interpret two-dimensional data as columns or as rows. If None, - the orientation is inferred by matching the columns and data dimensions. If - this does not yield conclusive results, column orientation is used. - - """ - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from an Arrow table. - - This operation will be zero copy for the most part. Types that are not - supported by Polars may be cast to the closest supported type. - - Parameters - ---------- - data : arrow table, array, or sequence of sequences - Data representing an Arrow Table or Array. 
- schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - rechunk : bool, default True - Make sure that all data is in contiguous memory. - - """ - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a Polars DataFrame from a pandas DataFrame. - - Parameters - ---------- - data : pandas DataFrame - Two-dimensional data represented as a pandas DataFrame. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - rechunk : bool, default True - Make sure that all data is in contiguous memory. - nan_to_null : bool, default True - If the data contains NaN values they will be converted to null/None. - include_index : bool, default False - Load any non-default pandas indexes as columns. - - """ - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: - """ - Read a CSV file into a DataFrame. - - Use `pl.read_csv` to dispatch to this method. - - See Also - -------- - polars.io.read_csv - - """ - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: - """ - Read into a DataFrame from a parquet file. - - Use `pl.read_parquet` to dispatch to this method. - - See Also - -------- - polars.io.read_parquet - - """ - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: - """ - Read into a DataFrame from Apache Avro format. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns. - n_rows - Stop reading from Apache Avro file after reading `n_rows`. - - """ - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: - ''' - Read into a DataFrame from Arrow IPC file format. - - See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. - Arrow IPC files are also known as Feather (v2) files. 
- - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns to select. Accepts a list of column indices (starting at zero) or a - list of column names. - n_rows - Stop reading from IPC file after reading `n_rows`. - row_count_name - Row count name. - row_count_offset - Row count offset. - rechunk - Make sure that all data is contiguous. - memory_map - Memory map the file - - ''' - @classmethod - def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: - ''' - Read into a DataFrame from Arrow IPC record batch stream format. - - See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns to select. Accepts a list of column indices (starting at zero) or a - list of column names. - n_rows - Stop reading from IPC stream after reading `n_rows`. - row_count_name - Row count name. - row_count_offset - Row count offset. - rechunk - Make sure that all data is contiguous. - - ''' - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: - """ - Read into a DataFrame from a JSON file. - - Use `pl.read_json` to dispatch to this method. - - See Also - -------- - polars.io.read_json - - """ - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: - """ - Read into a DataFrame from a newline delimited JSON file. - - Use `pl.read_ndjson` to dispatch to this method. - - See Also - -------- - polars.io.read_ndjson - - """ - def _replace(self, column: str, new_column: Series) -> Self: - """Replace a column by a new Series (in place).""" - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: - """ - Numpy __array__ interface protocol. - - Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see - https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. - """ - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: - ''' - Convert to a dataframe object implementing the dataframe interchange protocol. - - Parameters - ---------- - nan_as_null - Overwrite null values in the data with `NaN`. - - .. warning:: - This functionality has not been implemented and the parameter will be - removed in a future version. - Setting this to `True` will raise a `NotImplementedError`. - allow_copy - Allow memory to be copied to perform the conversion. If set to `False`, - causes conversions that are not zero-copy to fail. - - Notes - ----- - Details on the Python dataframe interchange protocol: - https://data-apis.org/dataframe-protocol/latest/index.html - - Examples - -------- - Convert a Polars DataFrame to a generic dataframe object and access some - properties. - - >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) - >>> dfi = df.__dataframe__() - >>> dfi.num_rows() - 2 - >>> dfi.get_column(1).dtype - (, 64, \'g\', \'=\') - - ''' - def __dataframe_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. 
- """ - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with another object.""" - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with another DataFrame.""" - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with a non-DataFrame object.""" - def _div(self, other: Any) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Series]: ... - def __reversed__(self) -> Iterator[Series]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: - """Get item. Does quite a lot. Read the comments.""" - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: - """ - Format output data in HTML for display in Jupyter Notebooks. - - Output rows and columns can be modified by setting the following ENVIRONMENT - variables: - - * POLARS_FMT_MAX_COLS: set the number of columns - * POLARS_FMT_MAX_ROWS: set the number of rows - - """ - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: - ''' - Return the DataFrame as a scalar, or return the element at the given row/column. - - Parameters - ---------- - row - Optional row index. - column - Optional column index or name. - - See Also - -------- - row: Get the values of a single row, either by index or by predicate. - - Notes - ----- - If row/col not provided, this is equivalent to `df[0,0]`, with a check that - the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> df.select((pl.col("a") * pl.col("b")).sum()).item() - 32 - >>> df.item(1, 1) - 5 - >>> df.item(2, "b") - 6 - - ''' - def to_arrow(self) -> pa.Table: - ''' - Collect the underlying arrow arrays in an Arrow Table. 
- - This operation is mostly zero copy. - - Data types that do copy: - - CategoricalType - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} - ... ) - >>> df.to_arrow() - pyarrow.Table - foo: int64 - bar: large_string - ---- - foo: [[1,2,3,4,5,6]] - bar: [["a","b","c","d","e","f"]] - - ''' - def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: - ''' - Convert DataFrame to a dictionary mapping column name to values. - - Parameters - ---------- - as_series - True -> Values are Series - False -> Values are List[Any] - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4, 5], - ... "fruits": ["banana", "banana", "apple", "apple", "banana"], - ... "B": [5, 4, 3, 2, 1], - ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], - ... "optional": [28, 300, None, 2, -30], - ... } - ... ) - >>> df - shape: (5, 5) - ┌─────┬────────┬─────┬────────┬──────────┐ - │ A ┆ fruits ┆ B ┆ cars ┆ optional │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ - ╞═════╪════════╪═════╪════════╪══════════╡ - │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ - │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ - │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ - │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ - │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ - └─────┴────────┴─────┴────────┴──────────┘ - >>> df.to_dict(as_series=False) - {\'A\': [1, 2, 3, 4, 5], - \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], - \'B\': [5, 4, 3, 2, 1], - \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], - \'optional\': [28, 300, None, 2, -30]} - >>> df.to_dict(as_series=True) - {\'A\': shape: (5,) - Series: \'A\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ], \'fruits\': shape: (5,) - Series: \'fruits\' [str] - [ - "banana" - "banana" - "apple" - "apple" - "banana" - ], \'B\': shape: (5,) - Series: \'B\' [i64] - [ - 5 - 4 - 3 - 2 - 1 - ], \'cars\': shape: (5,) - Series: \'cars\' [str] - [ - "beetle" - "audi" - "beetle" - "beetle" - "beetle" - ], \'optional\': shape: (5,) - Series: \'optional\' [i64] - [ - 28 - 300 - null - 2 - -30 - ]} - - ''' - def to_dicts(self) -> list[dict[str, Any]]: - ''' - Convert every row to a dictionary of Python-native values. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.to_dicts() - [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] - - ''' - def to_numpy(self) -> np.ndarray[Any, Any]: - ''' - Convert DataFrame to a 2D NumPy array. - - This operation clones data. - - Parameters - ---------- - structured - Optionally return a structured array, with field names and - dtypes that correspond to the DataFrame schema. - order - The index order of the returned NumPy array, either C-like or - Fortran-like. In general, using the Fortran-like index order is faster. - However, the C-like order might be more appropriate to use for downstream - applications to prevent cloning data, e.g. when reshaping into a - one-dimensional array. Note that this option only takes effect if - `structured` is set to `False` and the DataFrame dtypes allow for a - global dtype for all columns. 
- - Notes - ----- - If you\'re attempting to convert Utf8 to an array you\'ll need to install - `pyarrow`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.5, 7.0, 8.5], - ... "ham": ["a", "b", "c"], - ... }, - ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, - ... ) - - Export to a standard 2D numpy array. - - >>> df.to_numpy() - array([[1, 6.5, \'a\'], - [2, 7.0, \'b\'], - [3, 8.5, \'c\']], dtype=object) - - Export to a structured array, which can better-preserve individual - column data, such as name and dtype... - - >>> df.to_numpy(structured=True) - array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], - dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np - >>> df.to_numpy(structured=True).view(np.recarray) - rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], - dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: - ''' - Cast to a pandas DataFrame. - - This requires that :mod:`pandas` and :mod:`pyarrow` are installed. - This operation clones data, unless `use_pyarrow_extension_array=True`. - - Parameters - ---------- - use_pyarrow_extension_array - Use PyArrow backed-extension arrays instead of numpy arrays for each column - of the pandas DataFrame; this allows zero copy operations and preservation - of null values. Subsequent operations on the resulting pandas DataFrame may - trigger conversion to NumPy arrays if that operation is not supported by - pyarrow compute functions. - **kwargs - Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. - - Returns - ------- - :class:`pandas.DataFrame` - - Examples - -------- - >>> import pandas - >>> df1 = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> pandas_df1 = df1.to_pandas() - >>> type(pandas_df1) - - >>> pandas_df1.dtypes - foo int64 - bar int64 - ham object - dtype: object - >>> df2 = pl.DataFrame( - ... { - ... "foo": [1, 2, None], - ... "bar": [6, None, 8], - ... "ham": [None, "b", "c"], - ... } - ... ) - >>> pandas_df2 = df2.to_pandas() - >>> pandas_df2 - foo bar ham - 0 1.0 6.0 None - 1 2.0 NaN b - 2 NaN 8.0 c - >>> pandas_df2.dtypes - foo float64 - bar float64 - ham object - dtype: object - >>> pandas_df2_pa = df2.to_pandas( - ... use_pyarrow_extension_array=True - ... ) # doctest: +SKIP - >>> pandas_df2_pa # doctest: +SKIP - foo bar ham - 0 1 6 - 1 2 b - 2 8 c - >>> pandas_df2_pa.dtypes # doctest: +SKIP - foo int64[pyarrow] - bar int64[pyarrow] - ham large_string[pyarrow] - dtype: object - - ''' - def to_series(self, index: int = ...) -> Series: - ''' - Select column as Series at index location. - - Parameters - ---------- - index - Location of selection. - - See Also - -------- - get_column - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.to_series(1) - shape: (3,) - Series: \'bar\' [i64] - [ - 6 - 7 - 8 - ] - - ''' - def to_init_repr(self, n: int = ...) -> str: - ''' - Convert DataFrame to instantiatable string representation. - - Parameters - ---------- - n - Only use first n rows. - - See Also - -------- - polars.Series.to_init_repr - polars.from_repr - - Examples - -------- - >>> df = pl.DataFrame( - ... [ - ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), - ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), - ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), - ... ] - ... 
) - >>> print(df.to_init_repr()) - pl.DataFrame( - [ - pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), - pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), - pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), - ] - ) - - >>> df_from_str_repr = eval(df.to_init_repr()) - >>> df_from_str_repr - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ f32 ┆ cat │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 7.0 ┆ b │ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: - ''' - Serialize to JSON representation. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - If set to `None` (default), the output is returned as a string instead. - pretty - Pretty serialize json. - row_oriented - Write to row oriented json. This is slower, but more common. - - See Also - -------- - DataFrame.write_ndjson - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... } - ... ) - >>> df.write_json() - \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' - >>> df.write_json(row_oriented=True) - \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' - - ''' - def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: - ''' - Serialize to newline delimited JSON representation. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - If set to `None` (default), the output is returned as a string instead. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... } - ... ) - >>> df.write_ndjson() - \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' - - ''' - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: - ''' - Write to comma-separated values (CSV) file. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - If set to `None` (default), the output is returned as a string instead. - include_bom - Whether to include UTF-8 BOM in the CSV output. - include_header - Whether to include header in the CSV output. - separator - Separate CSV fields with this symbol. - line_terminator - String used to end each row. - quote_char - Byte to use as quoting character. - batch_size - Number of rows that will be processed per thread. - datetime_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. If no format specified, the default fractional-second - precision is inferred from the maximum timeunit found in the frame\'s - Datetime cols (if any). - date_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. - time_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. - float_precision - Number of decimal places to write, applied to both `Float32` and - `Float64` datatypes. - null_value - A string representing null values (defaulting to the empty string). - quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} - Determines the quoting strategy used. - - - necessary (default): This puts quotes around fields only when necessary. - They are necessary when fields contain a quote, - separator or record terminator. 
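Editor's sketch (not part of the generated stub): `write_ndjson` writes to a path or, with `file=None`, returns the newline-delimited JSON as a string; the temporary path used here is purely illustrative.

    import tempfile
    from pathlib import Path
    import polars as pl

    df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})

    with tempfile.TemporaryDirectory() as tmp:
        path = Path(tmp) / "frame.ndjson"
        df.write_ndjson(path)                       # one JSON object per line
        assert pl.read_ndjson(path).equals(df)

    # with file=None the NDJSON is returned as a string instead
    text = df.write_ndjson()
    assert text.splitlines()[0] == '{"foo":1,"bar":6}'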
- Quotes are also necessary when writing an empty record - (which is indistinguishable from a record with one empty field). - This is the default. - - always: This puts quotes around every field. Always. - - never: This never puts quotes around fields, even if that results in - invalid CSV data (e.g.: by not quoting strings containing the separator). - - non_numeric: This puts quotes around all fields that are non-numeric. - Namely, when writing a field that does not parse as a valid float - or integer, then quotes will be used even if they aren`t strictly - necessary. - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.csv" - >>> df.write_csv(path, separator=",") - - ''' - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: - ''' - Write to Apache Avro file. - - Parameters - ---------- - file - File path or writeable file-like object to which the data will be written. - compression : {\'uncompressed\', \'snappy\', \'deflate\'} - Compression method. Defaults to "uncompressed". - name - Schema name. Defaults to empty string. - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.avro" - >>> df.write_avro(path) - - ''' - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: - ''' - Write frame data to a table in an Excel workbook/worksheet. - - Parameters - ---------- - workbook : Workbook - String name or path of the workbook to create, BytesIO object to write - into, or an open `xlsxwriter.Workbook` object that has not been closed. - If None, writes to a `dataframe.xlsx` workbook in the working directory. - worksheet : str - Name of target worksheet; if None, writes to "Sheet1" when creating a new - workbook (note that writing to an existing workbook requires a valid - existing -or new- worksheet name). - position : {str, tuple} - Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. - table_style : {str, dict} - A named Excel table style, such as "Table Style Medium 4", or a dictionary - of `{"key":value,}` options containing one or more of the following keys: - "style", "first_column", "last_column", "banded_columns, "banded_rows". - table_name : str - Name of the output table object in the worksheet; can then be referred to - in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. - column_formats : dict - A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an - Excel format string to the given columns. Formats defined here (such as - "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. - dtype_formats : dict - A `{dtype:str,}` dictionary that sets the default Excel format for the - given dtype. (This can be overridden on a per-column basis by the - `column_formats` param). It is also valid to use dtype groups such as - `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform - integer and float formats. - conditional_formats : dict - A dictionary of colname (or selector) keys to a format str, dict, or list - that defines conditional formatting options for the specified columns. 
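Editor's sketch (not part of the generated stub): a minimal Avro round trip using a temporary directory; file names here are illustrative only.

    import tempfile
    from pathlib import Path
    import polars as pl

    df = pl.DataFrame({"foo": [1, 2, 3], "ham": ["a", "b", "c"]})

    with tempfile.TemporaryDirectory() as tmp:
        path = Path(tmp) / "frame.avro"
        df.write_avro(path)                 # default compression is "uncompressed"
        assert pl.read_avro(path).equals(df)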
- - * If supplying a string typename, should be one of the valid `xlsxwriter` - types such as "3_color_scale", "data_bar", etc. - * If supplying a dictionary you can make use of any/all `xlsxwriter` - supported options, including icon sets, formulae, etc. - * Supplying multiple columns as a tuple/key will apply a single format - across all columns - this is effective in creating a heatmap, as the - min/max values will be determined across the entire range, not per-column. - * Finally, you can also supply a list made up from the above options - in order to apply *more* than one conditional format to the same range. - header_format : dict - A `{key:value,}` dictionary of `xlsxwriter` format options to apply - to the table header row, such as `{"bold":True, "font_color":"#702963"}`. - column_totals : {bool, list, dict} - Add a column-total row to the exported table. - - * If True, all numeric columns will have an associated total using "sum". - * If passing a string, it must be one of the valid total function names - and all numeric columns will have an associated total using that function. - * If passing a list of colnames, only those given will have a total. - * For more control, pass a `{colname:funcname,}` dict. - - Valid total function names are "average", "count_nums", "count", "max", - "min", "std_dev", "sum", and "var". - column_widths : {dict, int} - A `{colname:int,}` or `{selector:int,}` dict or a single integer that - sets (or overrides if autofitting) table column widths, in integer pixel - units. If given as an integer the same value is used for all table columns. - row_totals : {dict, bool} - Add a row-total column to the right-hand side of the exported table. - - * If True, a column called "total" will be added at the end of the table - that applies a "sum" function row-wise across all numeric columns. - * If passing a list/sequence of column names, only the matching columns - will participate in the sum. - * Can also pass a `{colname:columns,}` dictionary to create one or - more total columns with distinct names, referencing different columns. - row_heights : {dict, int} - An int or `{row_index:int,}` dictionary that sets the height of the given - rows (if providing a dictionary) or all rows (if providing an integer) that - intersect with the table body (including any header and total row) in - integer pixel units. Note that `row_index` starts at zero and will be - the header row (unless `include_header` is False). - sparklines : dict - A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more - sparklines to be written into a new column in the table. - - * If passing a list of colnames (used as the source of the sparkline data) - the default sparkline settings are used (eg: line chart with no markers). - * For more control an `xlsxwriter`-compliant options dict can be supplied, - in which case three additional polars-specific keys are available: - "columns", "insert_before", and "insert_after". These allow you to define - the source columns and position the sparkline(s) with respect to other - table columns. If no position directive is given, sparklines are added to - the end of the table (eg: to the far right) in the order they are given. - formulas : dict - A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or - more formulas to be written into a new column in the table. Note that you - are strongly advised to use structured references in your formulae wherever - possible to make it simple to reference columns by name. 
- - * If providing a string formula (such as "=[@colx]*[@coly]") the column will - be added to the end of the table (eg: to the far right), after any default - sparklines and before any row_totals. - * For the most control supply an options dictionary with the following keys: - "formula" (mandatory), one of "insert_before" or "insert_after", and - optionally "return_dtype". The latter is used to appropriately format the - output of the formula and allow it to participate in row/column totals. - float_precision : int - Default number of decimals displayed for floating point columns (note that - this is purely a formatting directive; the actual values are not rounded). - include_header : bool - Indicate if the table should be created with a header row. - autofilter : bool - If the table has headers, provide autofilter capability. - autofit : bool - Calculate individual column widths from the data. - hidden_columns : list - A list or selector representing table columns to hide in the worksheet. - hide_gridlines : bool - Do not display any gridlines on the output worksheet. - sheet_zoom : int - Set the default zoom level of the output worksheet. - freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) - Freeze workbook panes. - - * If (row, col) is supplied, panes are split at the top-left corner of the - specified cell, which are 0-indexed. Thus, to freeze only the top row, - supply (1, 0). - * Alternatively, cell notation can be used to supply the cell. For example, - "A2" indicates the split occurs at the top-left of cell A2, which is the - equivalent of (1, 0). - * If (row, col, top_row, top_col) are supplied, the panes are split based on - the `row` and `col`, and the scrolling region is inititalized to begin at - the `top_row` and `top_col`. Thus, to freeze only the top row and have the - scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). - Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. - - Notes - ----- - * A list of compatible `xlsxwriter` format property names can be found here: - https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties - - * Conditional formatting dictionaries should provide xlsxwriter-compatible - definitions; polars will take care of how they are applied on the worksheet - with respect to the relative sheet/column position. For supported options, - see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html - - * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible - key/values, as well as a mandatory polars "columns" key that defines the - sparkline source data; these source columns should all be adjacent. Two other - polars-specific keys are available to help define where the sparkline appears - in the table: "insert_after", and "insert_before". The value associated with - these keys should be the name of a column in the exported table. - https://xlsxwriter.readthedocs.io/working_with_sparklines.html - - * Formula dictionaries *must* contain a key called "formula", and then optional - "insert_after", "insert_before", and/or "return_dtype" keys. These additional - keys allow the column to be injected into the table at a specific location, - and/or to define the return type of the formula (eg: "Int64", "Float64", etc). - Formulas that refer to table columns should use Excel\'s structured references - syntax to ensure the formula is applied correctly and is table-relative. 
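Editor's sketch (not part of the generated stub): a small `write_excel` call exercising a few of the layout options described above. It assumes the `xlsxwriter` package is installed and, with no workbook given, writes `dataframe.xlsx` in the working directory; the worksheet name is hypothetical.

    import polars as pl

    df = pl.DataFrame({"id": ["a", "b", "c"], "value": [1.5, 2.25, 3.0]})

    df.write_excel(
        worksheet="layout_demo",
        table_style="Table Style Light 9",
        column_widths={"value": 120},   # pixel width, overrides autofit for this column
        freeze_panes=(1, 0),            # keep the header row visible while scrolling
        hide_gridlines=True,
        autofit=True,
    )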
- https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e - - Examples - -------- - Instantiate a basic DataFrame: - - >>> from random import uniform - >>> from datetime import date - >>> - >>> df = pl.DataFrame( - ... { - ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], - ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], - ... "val": [10_000, 20_000, 30_000], - ... } - ... ) - - Export to "dataframe.xlsx" (the default workbook name, if not specified) in the - working directory, add column totals ("sum" by default) on all numeric columns, - then autofit: - - >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP - - Write frame to a specific location on the sheet, set a named table style, - apply US-style date formatting, increase default float precision, apply a - non-default total function to a single column, autofit: - - >>> df.write_excel( # doctest: +SKIP - ... position="B4", - ... table_style="Table Style Light 16", - ... dtype_formats={pl.Date: "mm/dd/yyyy"}, - ... column_totals={"num": "average"}, - ... float_precision=6, - ... autofit=True, - ... ) - - Write the same frame to a named worksheet twice, applying different styles - and conditional formatting to each table, adding table titles using explicit - xlsxwriter integration: - - >>> from xlsxwriter import Workbook - >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP - ... # basic/default conditional formatting - ... df.write_excel( - ... workbook=wb, - ... worksheet="data", - ... position=(3, 1), # specify position as (row,col) coordinates - ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, - ... table_style="Table Style Medium 4", - ... ) - ... - ... # advanced conditional formatting, custom styles - ... df.write_excel( - ... workbook=wb, - ... worksheet="data", - ... position=(len(df) + 7, 1), - ... table_style={ - ... "style": "Table Style Light 4", - ... "first_column": True, - ... }, - ... conditional_formats={ - ... "num": { - ... "type": "3_color_scale", - ... "min_color": "#76933c", - ... "mid_color": "#c4d79b", - ... "max_color": "#ebf1de", - ... }, - ... "val": { - ... "type": "data_bar", - ... "data_bar_2010": True, - ... "bar_color": "#9bbb59", - ... "bar_negative_color_same": True, - ... "bar_negative_border_color_same": True, - ... }, - ... }, - ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, - ... column_widths={"val": 125}, - ... autofit=True, - ... ) - ... - ... # add some table titles (with a custom format) - ... ws = wb.get_worksheet_by_name("data") - ... fmt_title = wb.add_format( - ... { - ... "font_color": "#4f6228", - ... "font_size": 12, - ... "italic": True, - ... "bold": True, - ... } - ... ) - ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) - ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) - ... - - Export a table containing two different types of sparklines. Use default - options for the "trend" sparkline and customised options (and positioning) - for the "+/-" win_loss sparkline, with non-default integer dtype formatting, - column totals, a subtle two-tone heatmap and hidden worksheet gridlines: - - >>> df = pl.DataFrame( - ... { - ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], - ... "q1": [100, 55, -20, 0, 35], - ... "q2": [30, -10, 15, 60, 20], - ... "q3": [-50, 0, 40, 80, 80], - ... "q4": [75, 55, 25, -10, -55], - ... } - ... ) - >>> df.write_excel( # doctest: +SKIP - ... 
table_style="Table Style Light 2", - ... # apply accounting format to all flavours of integer - ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, - ... sparklines={ - ... # default options; just provide source cols - ... "trend": ["q1", "q2", "q3", "q4"], - ... # customised sparkline type, with positioning directive - ... "+/-": { - ... "columns": ["q1", "q2", "q3", "q4"], - ... "insert_after": "id", - ... "type": "win_loss", - ... }, - ... }, - ... conditional_formats={ - ... # create a unified multi-column heatmap - ... ("q1", "q2", "q3", "q4"): { - ... "type": "2_color_scale", - ... "min_color": "#95b3d7", - ... "max_color": "#ffffff", - ... }, - ... }, - ... column_totals=["q1", "q2", "q3", "q4"], - ... row_totals=True, - ... hide_gridlines=True, - ... ) - - Export a table containing an Excel formula-based column that calculates a - standardised Z-score, showing use of structured references in conjunction - with positioning directives, column totals, and custom formatting. - - >>> df = pl.DataFrame( - ... { - ... "id": ["a123", "b345", "c567", "d789", "e101"], - ... "points": [99, 45, 50, 85, 35], - ... } - ... ) - >>> df.write_excel( # doctest: +SKIP - ... table_style={ - ... "style": "Table Style Medium 15", - ... "first_column": True, - ... }, - ... column_formats={ - ... "id": {"font": "Consolas"}, - ... "points": {"align": "center"}, - ... "z-score": {"align": "center"}, - ... }, - ... column_totals="average", - ... formulas={ - ... "z-score": { - ... # use structured references to refer to the table columns and \'totals\' row - ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", - ... "insert_after": "points", - ... "return_dtype": pl.Float64, - ... } - ... }, - ... hide_gridlines=True, - ... sheet_zoom=125, - ... ) - - ''' - def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: - ''' - Write to Arrow IPC binary stream or Feather file. - - See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - file - Path or writeable file-like object to which the IPC data will be - written. If set to `None`, the output is returned as a BytesIO object. - compression : {\'uncompressed\', \'lz4\', \'zstd\'} - Compression method. Defaults to "uncompressed". - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.arrow" - >>> df.write_ipc(path) - - ''' - def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: - ''' - Write to Arrow IPC record batch stream. - - See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - file - Path or writeable file-like object to which the IPC record batch data will - be written. If set to `None`, the output is returned as a BytesIO object. - compression : {\'uncompressed\', \'lz4\', \'zstd\'} - Compression method. Defaults to "uncompressed". - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... 
) - >>> path: pathlib.Path = dirpath / "new_file.arrow" - >>> df.write_ipc_stream(path) - - ''' - def write_parquet(self, file: str | Path | BytesIO) -> None: - ''' - Write to Apache Parquet file. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. - Choose "snappy" for more backwards compatibility guarantees - when you deal with older parquet readers. - compression_level - The level of compression to use. Higher compression means smaller files on - disk. - - - "gzip" : min-level: 0, max-level: 10. - - "brotli" : min-level: 0, max-level: 11. - - "zstd" : min-level: 1, max-level: 22. - - statistics - Write statistics to the parquet headers. This requires extra compute. - row_group_size - Size of the row groups in number of rows. Defaults to 512^2 rows. - use_pyarrow - Use C++ parquet implementation vs Rust parquet implementation. - At the moment C++ supports more features. - pyarrow_options - Arguments passed to `pyarrow.parquet.write_table`. - - If you pass `partition_cols` here, the dataset will be written - using `pyarrow.parquet.write_to_dataset`. - The `partition_cols` parameter leads to write the dataset to a directory. - Similar to Spark\'s partitioned datasets. - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.parquet" - >>> df.write_parquet(path) - - We can use pyarrow with use_pyarrow_write_to_dataset=True - to write partitioned datasets. The following example will - write the first row to ../watermark=1/*.parquet and the - other rows to ../watermark=2/*.parquet. - - >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) - >>> path: pathlib.Path = dirpath / "partitioned_object" - >>> df.write_parquet( - ... path, - ... use_pyarrow=True, - ... pyarrow_options={"partition_cols": ["watermark"]}, - ... ) - - ''' - def write_database(self, table_name: str, connection: str) -> None: - ''' - Write a polars frame to a database. - - Parameters - ---------- - table_name - Name of the table to create or append to in the target SQL database. - If your table name contains special characters, it should be quoted. - connection - Connection URI string, for example: - - * "postgresql://user:pass@server:port/database" - * "sqlite:////path/to/database.db" - if_exists : {\'append\', \'replace\', \'fail\'} - The insert mode. - \'replace\' will create a new database table, overwriting an existing one. - \'append\' will append to an existing table. - \'fail\' will fail if table already exists. - engine : {\'sqlalchemy\', \'adbc\'} - Select the engine used for writing the data. - ''' - def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: - ''' - Write DataFrame as delta table. - - Parameters - ---------- - target - URI of a table or a DeltaTable object. - mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} - How to handle existing data. - - * If \'error\', throw an error if the table already exists (default). - * If \'append\', will add new data. - * If \'overwrite\', will replace table with new data. - * If \'ignore\', will not write anything if table already exists. 
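Editor's sketch (not part of the generated stub): a minimal `write_database` call against a local SQLite file, assuming `sqlalchemy` (and `pandas`, which the sqlalchemy engine uses internally) are installed; the table name and database file are hypothetical.

    import polars as pl

    df = pl.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})

    df.write_database(
        table_name="demo_table",            # hypothetical table name
        connection="sqlite:///demo.db",     # local SQLite database file
        if_exists="replace",
        engine="sqlalchemy",
    )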
- overwrite_schema - If True, allows updating the schema of the table. - storage_options - Extra options for the storage backends supported by `deltalake`. - For cloud storages, this may include configurations for authentication etc. - - * See a list of supported storage options for S3 `here `__. - * See a list of supported storage options for GCS `here `__. - * See a list of supported storage options for Azure `here `__. - delta_write_options - Additional keyword arguments while writing a Delta lake Table. - See a list of supported write options `here `__. - - Raises - ------ - TypeError - If the DataFrame contains unsupported data types. - ArrowInvalidError - If the DataFrame contains data types that could not be cast to their - primitive type. - - Notes - ----- - The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` - are not supported by the delta protocol specification and will raise a - TypeError. - - Some other data types are not supported but have an associated `primitive type - `__ - to which they can be cast. This affects the following data types: - - - Unsigned integers - - :class:`Datetime` types with millisecond or nanosecond precision or with - time zone information - - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) - - Polars columns are always nullable. To write data to a delta table with - non-nullable columns, a custom pyarrow schema has to be passed to the - `delta_write_options`. See the last example below. - - Examples - -------- - Write a dataframe to the local filesystem as a Delta Lake table. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> table_path = "/path/to/delta-table/" - >>> df.write_delta(table_path) # doctest: +SKIP - - Append data to an existing Delta Lake table on the local filesystem. - Note that this will fail if the schema of the new data does not match the - schema of the existing table. - - >>> df.write_delta(table_path, mode="append") # doctest: +SKIP - - Overwrite a Delta Lake table as a new version. - If the schemas of the new and old data are the same, setting - `overwrite_schema` is not required. - - >>> existing_table_path = "/path/to/delta-table/" - >>> df.write_delta( - ... existing_table_path, mode="overwrite", overwrite_schema=True - ... ) # doctest: +SKIP - - Write a dataframe as a Delta Lake table to a cloud object store like S3. - - >>> table_path = "s3://bucket/prefix/to/delta-table/" - >>> df.write_delta( - ... table_path, - ... storage_options={ - ... "AWS_REGION": "THE_AWS_REGION", - ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", - ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", - ... }, - ... ) # doctest: +SKIP - - Write DataFrame as a Delta Lake table with non-nullable columns. - - >>> import pyarrow as pa - >>> existing_table_path = "/path/to/delta-table/" - >>> df.write_delta( - ... existing_table_path, - ... delta_write_options={ - ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) - ... }, - ... ) # doctest: +SKIP - - ''' - def estimated_size(self, unit: SizeUnit = ...) -> int | float: - ''' - Return an estimation of the total (heap) allocated size of the `DataFrame`. - - Estimated size is given in the specified unit (bytes by default). - - This estimation is the sum of the size of its buffers, validity, including - nested arrays. Multiple arrays may share buffers and bitmaps. 
Therefore, the - size of 2 arrays is not the sum of the sizes computed from this function. In - particular, [`StructArray`]\'s size is an upper bound. - - When an array is sliced, its allocated size remains constant because the buffer - unchanged. However, this function will yield a smaller number. This is because - this function returns the visible size of the buffer, not its total capacity. - - FFI buffers are included in this estimation. - - Parameters - ---------- - unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} - Scale the returned size to the given unit. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "x": list(reversed(range(1_000_000))), - ... "y": [v / 1000 for v in range(1_000_000)], - ... "z": [str(v) for v in range(1_000_000)], - ... }, - ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], - ... ) - >>> df.estimated_size() - 25888898 - >>> df.estimated_size("mb") - 24.689577102661133 - - ''' - def transpose(self) -> Self: - ''' - Transpose a DataFrame over the diagonal. - - Parameters - ---------- - include_header - If set, the column names will be added as first column. - header_name - If `include_header` is set, this determines the name of the column that will - be inserted. - column_names - Optional iterable yielding strings or a string naming an existing column. - These will name the value (non-header) columns in the transposed data. - - Notes - ----- - This is a very expensive operation. Perhaps you can do it differently. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) - >>> df.transpose(include_header=True) - shape: (2, 4) - ┌────────┬──────────┬──────────┬──────────┐ - │ column ┆ column_0 ┆ column_1 ┆ column_2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞════════╪══════════╪══════════╪══════════╡ - │ a ┆ 1 ┆ 2 ┆ 3 │ - │ b ┆ 1 ┆ 2 ┆ 3 │ - └────────┴──────────┴──────────┴──────────┘ - - Replace the auto-generated column names with a list - - >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 2 ┆ 3 │ - │ 1 ┆ 2 ┆ 3 │ - └─────┴─────┴─────┘ - - Include the header as a separate column - - >>> df.transpose( - ... include_header=True, header_name="foo", column_names=["a", "b", "c"] - ... ) - shape: (2, 4) - ┌─────┬─────┬─────┬─────┐ - │ foo ┆ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═════╡ - │ a ┆ 1 ┆ 2 ┆ 3 │ - │ b ┆ 1 ┆ 2 ┆ 3 │ - └─────┴─────┴─────┴─────┘ - - Replace the auto-generated column with column names from a generator function - - >>> def name_generator(): - ... base_name = "my_column_" - ... count = 0 - ... while True: - ... yield f"{base_name}{count}" - ... count += 1 - ... 
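Editor's sketch (not part of the generated stub): the `unit` argument only rescales the byte figure; this assumes the usual 1024-based scaling.

    import polars as pl

    df = pl.DataFrame({"x": list(range(100_000)), "y": [str(v) for v in range(100_000)]})

    size_b = df.estimated_size()        # bytes by default
    size_kb = df.estimated_size("kb")   # same estimate, scaled
    assert abs(size_kb - size_b / 1024) < 1e-6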
- >>> df.transpose(include_header=False, column_names=name_generator()) - shape: (2, 3) - ┌─────────────┬─────────────┬─────────────┐ - │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════════════╪═════════════╪═════════════╡ - │ 1 ┆ 2 ┆ 3 │ - │ 1 ┆ 2 ┆ 3 │ - └─────────────┴─────────────┴─────────────┘ - - Use an existing column as the new column names - - >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) - >>> df.transpose(column_names="id") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 3 ┆ 2 │ - │ 3 ┆ 4 ┆ 6 │ - └─────┴─────┴─────┘ - >>> df.transpose(include_header=True, header_name="new_id", column_names="id") - shape: (2, 4) - ┌────────┬─────┬─────┬─────┐ - │ new_id ┆ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╪═════╡ - │ col1 ┆ 1 ┆ 3 ┆ 2 │ - │ col2 ┆ 3 ┆ 4 ┆ 6 │ - └────────┴─────┴─────┴─────┘ - ''' - def reverse(self) -> DataFrame: - ''' - Reverse the DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "key": ["a", "b", "c"], - ... "val": [1, 2, 3], - ... } - ... ) - >>> df.reverse() - shape: (3, 2) - ┌─────┬─────┐ - │ key ┆ val │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ c ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 1 │ - └─────┴─────┘ - - ''' - def rename(self, mapping: dict[str, str]) -> DataFrame: - ''' - Rename column names. - - Parameters - ---------- - mapping - Key value pairs that map from old name to new name. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} - ... ) - >>> df.rename({"foo": "apple"}) - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - - ''' - def insert_column(self, index: int, column: Series) -> Self: - ''' - Insert a Series at a certain column index. - - This operation is in place. - - Parameters - ---------- - index - Index at which to insert the new `Series` column. - column - `Series` to insert. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> s = pl.Series("baz", [97, 98, 99]) - >>> df.insert_column(1, s) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ baz ┆ bar │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 97 ┆ 4 │ - │ 2 ┆ 98 ┆ 5 │ - │ 3 ┆ 99 ┆ 6 │ - └─────┴─────┴─────┘ - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) - >>> df.insert_column(3, s) - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ d │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ - │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ - │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ - └─────┴──────┴───────┴──────┘ - - ''' - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: - ''' - Filter the rows in the DataFrame based on a predicate expression. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - predicates - Expression that evaluates to a boolean Series. - constraints - Column filters. Use name=value to filter column name by the supplied value. 
- - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - - Filter on one condition: - - >>> df.filter(pl.col("foo") > 1) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Filter on multiple conditions, combined with and/or operators: - - >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Provide multiple filters using `*args` syntax: - - >>> df.filter( - ... pl.col("foo") <= 2, - ... ~pl.col("ham").is_in(["b", "c"]), - ... ) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `**kwargs` syntax: - - >>> df.filter(foo=2, ham="b") - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def glimpse(self) -> str | None: - ''' - Return a dense preview of the DataFrame. - - The formatting shows one line per column so that wide dataframes display - cleanly. Each line shows the column name, the data type, and the first - few values. - - Parameters - ---------- - max_items_per_column - Maximum number of items to show per column. - max_colname_length - Maximum length of the displayed column names; values that exceed this - value are truncated with a trailing ellipsis. - return_as_string - If True, return the preview as a string instead of printing to stdout. - - See Also - -------- - describe, head, tail - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, 2.8, 3.0], - ... "b": [4, 5, None], - ... "c": [True, False, True], - ... "d": [None, "b", "c"], - ... "e": ["usd", "eur", None], - ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], - ... } - ... ) - >>> df.glimpse() - Rows: 3 - Columns: 6 - $ a 1.0, 2.8, 3.0 - $ b 4, 5, None - $ c True, False, True - $ d None, \'b\', \'c\' - $ e \'usd\', \'eur\', None - $ f 2020-01-01, 2021-01-02, 2022-01-01 - - ''' - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: - ''' - Summary statistics for a DataFrame. - - Parameters - ---------- - percentiles - One or more percentiles to include in the summary statistics. - All values must be in the range `[0, 1]`. - - Notes - ----- - The median is included by default as the 50% percentile. - - See Also - -------- - glimpse - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, 2.8, 3.0], - ... "b": [4, 5, None], - ... "c": [True, False, True], - ... "d": [None, "b", "c"], - ... "e": ["usd", "eur", None], - ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], - ... } - ... 
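Editor's sketch (not part of the generated stub): requesting non-default percentiles from `describe` and capturing `glimpse` output as a string rather than printing it; the assertions reflect the row labels shown in the docstrings above.

    import polars as pl

    df = pl.DataFrame({"a": [1.0, 2.8, 3.0], "b": [4, 5, None]})

    # the default percentiles are 25/50/75; here we ask for 10% and 90% instead
    summary = df.describe(percentiles=(0.1, 0.9))
    assert "10%" in summary["describe"].to_list()

    # glimpse can return its one-line-per-column preview instead of printing it
    preview = df.glimpse(return_as_string=True)
    assert preview.startswith("Rows: 3")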
) - >>> df.describe() - shape: (9, 7) - ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ - │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ - ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ - │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ - │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ - │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ - │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ - │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ - │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ - │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ - │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ - │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ - └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ - - ''' - def get_column_index(self, name: str) -> int: - ''' - Find the index of a column by name. - - Parameters - ---------- - name - Name of the column to find. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} - ... ) - >>> df.get_column_index("ham") - 2 - - ''' - def replace_column(self, index: int, column: Series) -> Self: - ''' - Replace a column at an index location. - - This operation is in place. - - Parameters - ---------- - index - Column index. - column - Series that will replace the column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> s = pl.Series("apple", [10, 20, 30]) - >>> df.replace_column(0, s) - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 10 ┆ 6 ┆ a │ - │ 20 ┆ 7 ┆ b │ - │ 30 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - ''' - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: - ''' - Sort the dataframe by the given columns. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. - *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [6.0, 5.0, 4.0], - ... "c": ["a", "c", "b"], - ... } - ... ) - >>> df.sort("a") - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - Sorting by expressions is also supported. - - >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - └──────┴─────┴─────┘ - - Sort by multiple columns by passing a list of columns. - - >>> df.sort(["c", "a"], descending=True) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - └──────┴─────┴─────┘ - - Or use positional arguments to sort by multiple columns in the same way. 
- - >>> df.sort("c", "a", descending=[False, True]) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - ''' - def top_k(self, k: int) -> DataFrame: - ''' - Return the `k` largest elements. - - If \'descending=True` the smallest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - bottom_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 largest values in column b. - - >>> df.top_k(4, by="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ a ┆ 2 │ - │ b ┆ 2 │ - │ b ┆ 1 │ - └─────┴─────┘ - - Get the rows which contain the 4 largest values when sorting on column b and a. - - >>> df.top_k(4, by=["b", "a"]) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 2 │ - │ c ┆ 1 │ - └─────┴─────┘ - - ''' - def bottom_k(self, k: int) -> DataFrame: - ''' - Return the `k` smallest elements. - - If \'descending=True` the largest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - top_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 smallest values in column b. - - >>> df.bottom_k(4, by="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 1 │ - │ a ┆ 1 │ - │ c ┆ 1 │ - │ a ┆ 2 │ - └─────┴─────┘ - - Get the rows which contain the 4 smallest values when sorting on column a and b. - - >>> df.bottom_k(4, by=["a", "b"]) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ b ┆ 1 │ - │ b ┆ 2 │ - └─────┴─────┘ - - ''' - def equals(self, other: DataFrame) -> bool: - ''' - Check whether the DataFrame is equal to another DataFrame. - - Parameters - ---------- - other - DataFrame to compare with. - null_equal - Consider null values as equal. - - See Also - -------- - assert_frame_equal - - Examples - -------- - >>> df1 = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df2 = pl.DataFrame( - ... { - ... "foo": [3, 2, 1], - ... 
"bar": [8.0, 7.0, 6.0], - ... "ham": ["c", "b", "a"], - ... } - ... ) - >>> df1.equals(df1) - True - >>> df1.equals(df2) - False - - ''' - def replace(self, column: str, new_column: Series) -> Self: - ''' - Replace a column by a new Series. - - Parameters - ---------- - column - Column to replace. - new_column - New column to insert. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> s = pl.Series([10, 20, 30]) - >>> df.replace("foo", s) # works in-place! # doctest: +SKIP - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 10 ┆ 4 │ - │ 20 ┆ 5 │ - │ 30 ┆ 6 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int, length: int | None = ...) -> Self: - ''' - Get a slice of this DataFrame. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.slice(1, 2) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7.0 ┆ b │ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def head(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the last `abs(n)`. - - See Also - -------- - tail, glimpse, slice - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> df.head(3) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Pass a negative value to get all rows `except` the last `abs(n)`. - - >>> df.head(-3) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def tail(self, n: int = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the first `abs(n)`. - - See Also - -------- - head, slice - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> df.tail(3) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - │ 4 ┆ 9 ┆ d │ - │ 5 ┆ 10 ┆ e │ - └─────┴─────┴─────┘ - - Pass a negative value to get all rows `except` the first `abs(n)`. - - >>> df.tail(-3) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 4 ┆ 9 ┆ d │ - │ 5 ┆ 10 ┆ e │ - └─────┴─────┴─────┘ - - ''' - def limit(self, n: int = ...) -> Self: - """ - Get the first `n` rows. - - Alias for :func:`DataFrame.head`. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the last `abs(n)`. 
- - See Also - -------- - head - - """ - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: - ''' - Drop all rows that contain null values. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - subset - Column name(s) for which null values are considered. - If set to `None` (default), use all columns. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, None, 8], - ... "ham": ["a", "b", None], - ... } - ... ) - - The default behavior of this method is to drop rows where any single - value of the row is null. - - >>> df.drop_nulls() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - This behaviour can be constrained to consider only a subset of columns, as - defined by name or with a selector. For example, dropping rows if there is - a null in any of the integer columns: - - >>> import polars.selectors as cs - >>> df.drop_nulls(subset=cs.integer()) - shape: (2, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ null │ - └─────┴─────┴──────┘ - - Below are some additional examples that show how to drop null - values based on other conditions. - - >>> df = pl.DataFrame( - ... { - ... "a": [None, None, None, None], - ... "b": [1, 2, None, 1], - ... "c": [1, None, None, 1], - ... } - ... ) - >>> df - shape: (4, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪══════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ null ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴──────┴──────┘ - - Drop a row only if all values are null: - - >>> df.filter(~pl.all_horizontal(pl.all().is_null())) - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴─────┴──────┘ - - Drop a column if all values are null: - - >>> df[[s.name for s in df if not (s.null_count() == df.height)]] - shape: (4, 2) - ┌──────┬──────┐ - │ b ┆ c │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 1 ┆ 1 │ - │ 2 ┆ null │ - │ null ┆ null │ - │ 1 ┆ 1 │ - └──────┴──────┘ - - ''' - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the frame as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Notes - ----- - It is recommended to use LazyFrame when piping operations, in order - to fully take advantage of query optimization and parallelization. - See :meth:`df.lazy() `. - - Examples - -------- - >>> def cast_str_to_int(data, col_name): - ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) - ... 
- >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) - >>> df.pipe(cast_str_to_int, col_name="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 30 │ - │ 4 ┆ 40 │ - └─────┴─────┘ - - >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) - >>> df - shape: (2, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - └─────┴─────┘ - >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 1 │ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: - ''' - Add a column at index 0 that counts the rows. - - Parameters - ---------- - name - Name of the column to add. - offset - Start the row count at this offset. Default = 0 - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> df.with_row_count() - shape: (3, 3) - ┌────────┬─────┬─────┐ - │ row_nr ┆ a ┆ b │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╡ - │ 0 ┆ 1 ┆ 2 │ - │ 1 ┆ 3 ┆ 4 │ - │ 2 ┆ 5 ┆ 6 │ - └────────┴─────┴─────┘ - - ''' - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: - ''' - Start a group by operation. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - .. note:: - Within each group, the order of rows is always preserved, regardless - of this argument. - - Returns - ------- - GroupBy - Object which can be used to perform aggregations. - - Examples - -------- - Group by one column and call `agg` to compute the grouped sum of another - column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... ) - >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 2 │ - │ b ┆ 5 │ - │ c ┆ 3 │ - └─────┴─────┘ - - Set `maintain_order=True` to ensure the order of the groups is consistent with - the input. - - >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) - shape: (3, 2) - ┌─────┬───────────┐ - │ a ┆ c │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [5, 3] │ - │ b ┆ [4, 2] │ - │ c ┆ [1] │ - └─────┴───────────┘ - - Group by multiple columns by passing a list of column names. - - >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - Or use positional arguments to group by multiple columns in the same way. - Expressions are also accepted. 
- - >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ f64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 0 ┆ 4.0 │ - │ b ┆ 1 ┆ 3.0 │ - │ c ┆ 1 ┆ 1.0 │ - └─────┴─────┴─────┘ - - The `GroupBy` object returned by this method is iterable, returning the name - and data of each group. - - >>> for name, data in df.group_by("a"): # doctest: +SKIP - ... print(name) - ... print(data) - ... - a - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘ - b - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘ - c - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def rolling(self, index_column: IntoExpr) -> RollingGroupBy: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - Different from a `group_by_dynamic` the windows are now determined by the - individual values and are not of constant intervals. For constant intervals use - :func:`DataFrame.group_by_dynamic`. - - If you have a time series ``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... - * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... - * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - **"1i" # length 1** - - **"10i" # length 10** - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling operation on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. 
If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - RollingGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - group_by_dynamic - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> out = df.rolling(index_column="dt", period="2d").agg( - ... [ - ... pl.sum("a").alias("sum_a"), - ... pl.min("a").alias("min_a"), - ... pl.max("a").alias("max_a"), - ... ] - ... ) - >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] - >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] - >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] - >>> out - shape: (6, 4) - ┌─────────────────────┬───────┬───────┬───────┐ - │ dt ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴───────┴───────┴───────┘ - - ''' - def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - Time windows are calculated and rows are assigned to windows. Different from a - normal group by is that a row can be member of multiple groups. - By default, the windows look like: - - - [start, start + period) - - [start + every, start + every + period) - - [start + 2*every, start + 2*every + period) - - ... - - where `start` is determined by `start_by`, `offset`, and `every` (see parameter - descriptions below). - - .. warning:: - The index column must be sorted in ascending order. If `by` is passed, then - the index column must be sorted in ascending order within each group. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - - .. deprecated:: 0.19.4 - Use `label` instead. - include_boundaries - Add the lower and upper bound of the window to the "_lower_boundary" and - "_upper_boundary" columns. This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). 
- label : {\'left\', \'right\', \'datapoint\'} - Define which label to use for the window: - - - \'left\': lower boundary of the window - - \'right\': upper boundary of the window - - \'datapoint\': the first value of the index column in the given window. - If you don\'t need the label to be at one of the boundaries, choose this - option for maximum performance - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - DynamicGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - rolling - - Notes - ----- - 1) If you\'re coming from pandas, then - - .. code-block:: python - - # polars - df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) - - is equivalent to - - .. code-block:: python - - # pandas - df.set_index("ts").resample("D")["value"].sum().reset_index() - - though note that, unlike pandas, polars doesn\'t add extra rows for empty - windows. If you need `index_column` to be evenly spaced, then please combine - with :func:`DataFrame.upsample`. - - 2) The `every`, `period` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a group_by_dynamic on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Examples - -------- - >>> from datetime import datetime - >>> df = pl.DataFrame( - ... { - ... "time": pl.datetime_range( - ... start=datetime(2021, 12, 16), - ... end=datetime(2021, 12, 16, 3), - ... interval="30m", - ... eager=True, - ... ), - ... "n": range(7), - ... } - ... 
) - >>> df - shape: (7, 2) - ┌─────────────────────┬─────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i64 │ - ╞═════════════════════╪═════╡ - │ 2021-12-16 00:00:00 ┆ 0 │ - │ 2021-12-16 00:30:00 ┆ 1 │ - │ 2021-12-16 01:00:00 ┆ 2 │ - │ 2021-12-16 01:30:00 ┆ 3 │ - │ 2021-12-16 02:00:00 ┆ 4 │ - │ 2021-12-16 02:30:00 ┆ 5 │ - │ 2021-12-16 03:00:00 ┆ 6 │ - └─────────────────────┴─────┘ - - Group by windows of 1 hour starting at 2021-12-16 00:00:00. - - >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [1, 2] │ - │ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ 2021-12-16 02:00:00 ┆ [5, 6] │ - └─────────────────────┴───────────┘ - - The window boundaries can also be added to the aggregation result - - >>> df.group_by_dynamic( - ... "time", every="1h", include_boundaries=True, closed="right" - ... ).agg(pl.col("n").mean()) - shape: (4, 4) - ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ - │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ - ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ - │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ - │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ - │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ - │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ - └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ - - When closed="left", the window excludes the right end of interval: - [lower_bound, upper_bound) - - >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-16 00:00:00 ┆ [0, 1] │ - │ 2021-12-16 01:00:00 ┆ [2, 3] │ - │ 2021-12-16 02:00:00 ┆ [4, 5] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - When closed="both" the time values at the window boundaries belong to 2 groups. - - >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) - shape: (5, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ - │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - Dynamic group bys can also be combined with grouping on normal keys - - >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) - >>> df - shape: (7, 3) - ┌─────────────────────┬─────┬────────┐ - │ time ┆ n ┆ groups │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ str │ - ╞═════════════════════╪═════╪════════╡ - │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ - │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ - │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ - │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ - │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ - │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ - │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ - └─────────────────────┴─────┴────────┘ - >>> df.group_by_dynamic( - ... "time", - ... every="1h", - ... closed="both", - ... by="groups", - ... include_boundaries=True, - ... 
).agg(pl.col("n")) - shape: (7, 5) - ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ - │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ - ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ - │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ - │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ - │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ - │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ - │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ - └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ - - Dynamic group by on an index column - - >>> df = pl.DataFrame( - ... { - ... "idx": pl.int_range(0, 6, eager=True), - ... "A": ["A", "A", "B", "B", "B", "C"], - ... } - ... ) - >>> ( - ... df.group_by_dynamic( - ... "idx", - ... every="2i", - ... period="3i", - ... include_boundaries=True, - ... closed="right", - ... ).agg(pl.col("A").alias("A_agg_list")) - ... ) - shape: (4, 4) - ┌─────────────────┬─────────────────┬─────┬─────────────────┐ - │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 ┆ list[str] │ - ╞═════════════════╪═════════════════╪═════╪═════════════════╡ - │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ - │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ - │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ - │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ - └─────────────────┴─────────────────┴─────┴─────────────────┘ - - ''' - def upsample(self, time_column: str) -> Self: - ''' - Upsample a DataFrame at a regular frequency. - - The `every` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - - - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - Parameters - ---------- - time_column - time column will be used to determine a date_range. - Note that this column has to be sorted for the output to make sense. - every - interval will start \'every\' duration - offset - change the start of the date_range by this offset. - by - First group by these columns and then upsample for every group - maintain_order - Keep the ordering predictable. This is slower. - - Returns - ------- - DataFrame - Result will be sorted by `time_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - Examples - -------- - Upsample a DataFrame by a certain interval. - - >>> from datetime import datetime - >>> df = pl.DataFrame( - ... { - ... "time": [ - ... datetime(2021, 2, 1), - ... datetime(2021, 4, 1), - ... datetime(2021, 5, 1), - ... datetime(2021, 6, 1), - ... ], - ... "groups": ["A", "B", "A", "B"], - ... 
"values": [0, 1, 2, 3], - ... } - ... ).set_sorted("time") - >>> df.upsample( - ... time_column="time", every="1mo", by="groups", maintain_order=True - ... ).select(pl.all().forward_fill()) - shape: (7, 3) - ┌─────────────────────┬────────┬────────┐ - │ time ┆ groups ┆ values │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ str ┆ i64 │ - ╞═════════════════════╪════════╪════════╡ - │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ - │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ - │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ - │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ - └─────────────────────┴────────┴────────┘ - - ''' - def join_asof(self, other: DataFrame) -> DataFrame: - ''' - Perform an asof join. - - This is similar to a left-join except that we match on nearest key rather than - equal keys. - - Both DataFrames must be sorted by the asof_join key. - - For each row in the left DataFrame: - - - A "backward" search selects the last row in the right DataFrame whose - \'on\' key is less than or equal to the left\'s key. - - - A "forward" search selects the first row in the right DataFrame whose - \'on\' key is greater than or equal to the left\'s key. - - - A "nearest" search selects the last row in the right DataFrame whose value - is nearest to the left\'s key. String keys are not currently supported for a - nearest search. - - The default is "backward". - - Parameters - ---------- - other - Lazy DataFrame to join with. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - by - join on these columns before doing asof join - by_left - join on these columns before doing asof join - by_right - join on these columns before doing asof join - strategy : {\'backward\', \'forward\', \'nearest\'} - Join strategy. - suffix - Suffix to append to columns with a duplicate name. - tolerance - Numeric tolerance. By setting this the join will only be done if the near - keys are within this distance. If an asof join is done on columns of dtype - "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta - object or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. - - Examples - -------- - >>> from datetime import datetime - >>> gdp = pl.DataFrame( - ... { - ... "date": [ - ... datetime(2016, 1, 1), - ... datetime(2017, 1, 1), - ... datetime(2018, 1, 1), - ... datetime(2019, 1, 1), - ... ], # note record date: Jan 1st (sorted!) - ... "gdp": [4164, 4411, 4566, 4696], - ... } - ... ).set_sorted("date") - >>> population = pl.DataFrame( - ... { - ... "date": [ - ... datetime(2016, 5, 12), - ... datetime(2017, 5, 12), - ... 
datetime(2018, 5, 12), - ... datetime(2019, 5, 12), - ... ], # note record date: May 12th (sorted!) - ... "population": [82.19, 82.66, 83.12, 83.52], - ... } - ... ).set_sorted("date") - >>> population.join_asof(gdp, on="date", strategy="backward") - shape: (4, 3) - ┌─────────────────────┬────────────┬──────┐ - │ date ┆ population ┆ gdp │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ f64 ┆ i64 │ - ╞═════════════════════╪════════════╪══════╡ - │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ - │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ - │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ - │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ - └─────────────────────┴────────────┴──────┘ - - ''' - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: - ''' - Join in SQL-like fashion. - - Parameters - ---------- - other - DataFrame to join with. - on - Name(s) of the join columns in both DataFrames. - how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} - Join strategy. - - .. note:: - A left join preserves the row order of the left DataFrame. - left_on - Name(s) of the left join column(s). - right_on - Name(s) of the right join column(s). - suffix - Suffix to append to columns with a duplicate name. - validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} - Checks if join is of specified type. - - * *many_to_many* - “m:m”: default, does not result in checks - * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets - * *one_to_many* - “1:m”: check if join keys are unique in left dataset - * *many_to_one* - “m:1”: check if join keys are unique in right dataset - - .. note:: - - - This is currently not supported the streaming engine. - - This is only supported when joined by single columns. - - Returns - ------- - DataFrame - - See Also - -------- - join_asof - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> other_df = pl.DataFrame( - ... { - ... "apple": ["x", "y", "z"], - ... "ham": ["a", "b", "d"], - ... } - ... 
) - >>> df.join(other_df, on="ham") - shape: (2, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - └─────┴─────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="outer") - shape: (4, 4) - ┌──────┬──────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞══════╪══════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ null ┆ null ┆ d ┆ z │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └──────┴──────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="left") - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └─────┴─────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="semi") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 7.0 ┆ b │ - └─────┴─────┴─────┘ - - >>> df.join(other_df, on="ham", how="anti") - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - Notes - ----- - For joining on columns with categorical data, see `pl.StringCache()`. - - ''' - def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: - ''' - Apply a custom/user-defined function (UDF) over the rows of the DataFrame. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - The UDF will receive each row as a tuple of values: `udf(row)`. - - Implementing logic using a Python function is almost always *significantly* - slower and more memory intensive than implementing the same logic using - the native expression API because: - - - The native expression engine runs in Rust; UDFs run in Python. - - Use of Python UDFs forces the DataFrame to be materialized in memory. - - Polars-native expressions can be parallelised (UDFs typically cannot). - - Polars-native expressions can be logically optimised (UDFs cannot). - - Wherever possible you should strongly prefer the native expression API - to achieve the best performance. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output type of the operation. If none given, Polars tries to infer the type. - inference_size - Only used in the case when the custom function returns rows. - This uses the first `n` rows to determine the output schema. - - Notes - ----- - * The frame-level `apply` cannot track column names (as the UDF is a black-box - that may arbitrarily drop, rearrange, transform, or add new columns); if you - want to apply a UDF such that column names are preserved, you should use the - expression-level `apply` syntax instead. - - * If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. 
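        A minimal sketch of the `@lru_cache` pattern mentioned in the notes above
        (added for illustration, not part of the generated stub; `expensive` is a
        hypothetical helper and the cached values are assumed to be hashable):

        >>> from functools import lru_cache
        >>> @lru_cache(maxsize=None)
        ... def expensive(x: int) -> int:
        ...     return x * 2  # stand-in for a costly per-value computation
        ...
        >>> df = pl.DataFrame({"foo": [1, 1, 2], "bar": [-1, 5, 8]})
        >>> df.map_rows(lambda row: expensive(row[0]))  # doctest: +SKIP

        Repeated inputs (here `foo == 1`) hit the cache instead of recomputing.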
- - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) - - Return a DataFrame by mapping each row to a tuple: - - >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) - shape: (3, 2) - ┌──────────┬──────────┐ - │ column_0 ┆ column_1 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════════╪══════════╡ - │ 2 ┆ -3 │ - │ 4 ┆ 15 │ - │ 6 ┆ 24 │ - └──────────┴──────────┘ - - However, it is much better to implement this with a native expression: - - >>> df.select( - ... pl.col("foo") * 2, - ... pl.col("bar") * 3, - ... ) # doctest: +IGNORE_RESULT - - Return a DataFrame with a single column by mapping each row to a scalar: - - >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP - shape: (3, 1) - ┌───────┐ - │ apply │ - │ --- │ - │ i64 │ - ╞═══════╡ - │ 1 │ - │ 9 │ - │ 14 │ - └───────┘ - - In this case it is better to use the following native expression: - - >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT - - ''' - def hstack(self, columns: list[Series] | DataFrame) -> Self: - ''' - Return a new DataFrame grown horizontally by stacking multiple Series to it. - - Parameters - ---------- - columns - Series to stack. - in_place - Modify in place. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> x = pl.Series("apple", [10, 20, 30]) - >>> df.hstack([x]) - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6 ┆ a ┆ 10 │ - │ 2 ┆ 7 ┆ b ┆ 20 │ - │ 3 ┆ 8 ┆ c ┆ 30 │ - └─────┴─────┴─────┴───────┘ - - ''' - def vstack(self, other: DataFrame) -> Self: - ''' - Grow this DataFrame vertically by stacking a DataFrame to it. - - Parameters - ---------- - other - DataFrame to stack. - in_place - Modify in place. - - See Also - -------- - extend - - Examples - -------- - >>> df1 = pl.DataFrame( - ... { - ... "foo": [1, 2], - ... "bar": [6, 7], - ... "ham": ["a", "b"], - ... } - ... ) - >>> df2 = pl.DataFrame( - ... { - ... "foo": [3, 4], - ... "bar": [8, 9], - ... "ham": ["c", "d"], - ... } - ... ) - >>> df1.vstack(df2) - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - │ 4 ┆ 9 ┆ d │ - └─────┴─────┴─────┘ - - ''' - def extend(self, other: DataFrame) -> Self: - ''' - Extend the memory backed by this `DataFrame` with the values from `other`. - - Different from `vstack` which adds the chunks from `other` to the chunks of - this `DataFrame`, `extend` appends the data from `other` to the underlying - memory locations and thus may cause a reallocation. - - If this does not cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `vstack` when you want to do a query after a single - append. For instance, during online operations where you add `n` rows and rerun - a query. - - Prefer `vstack` over `extend` when you want to append many times before - doing a query. For instance, when you read in multiple files and want to store - them in a single `DataFrame`. In the latter case, finish the sequence of - `vstack` operations with a `rechunk`. - - Parameters - ---------- - other - DataFrame to vertically add. - - Warnings - -------- - This method modifies the dataframe in-place. The dataframe is returned for - convenience only. 
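        An illustrative sketch of the `vstack`-then-`rechunk` pattern described
        above (added here, not taken from the stub; the frames are made up):

        >>> frames = [pl.DataFrame({"foo": [i], "bar": [i * 10]}) for i in range(3)]
        >>> out = frames[0]
        >>> for other in frames[1:]:
        ...     out = out.vstack(other)  # cheap append: keeps the incoming chunks
        ...
        >>> out = out.rechunk()  # consolidate into contiguous memory before querying
        >>> out.n_chunks()
        1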
- - See Also - -------- - vstack - - Examples - -------- - >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) - >>> df1.extend(df2) - shape: (6, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 5 │ - │ 3 ┆ 6 │ - │ 10 ┆ 40 │ - │ 20 ┆ 50 │ - │ 30 ┆ 60 │ - └─────┴─────┘ - - ''' - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: - ''' - Remove columns from the dataframe. - - Parameters - ---------- - columns - Names of the columns that should be removed from the dataframe, or - a selector that determines the columns to drop. - *more_columns - Additional columns to drop, specified as positional arguments. - - Examples - -------- - Drop a single column by passing the name of that column. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.drop("ham") - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 6.0 │ - │ 2 ┆ 7.0 │ - │ 3 ┆ 8.0 │ - └─────┴─────┘ - - Drop multiple columns by passing a list of column names. - - >>> df.drop(["bar", "ham"]) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Drop multiple columns by passing a selector. - - >>> import polars.selectors as cs - >>> df.drop(cs.numeric()) - shape: (3, 1) - ┌─────┐ - │ ham │ - │ --- │ - │ str │ - ╞═════╡ - │ a │ - │ b │ - │ c │ - └─────┘ - - Use positional arguments to drop multiple columns. - - >>> df.drop("foo", "ham") - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 6.0 │ - │ 7.0 │ - │ 8.0 │ - └─────┘ - - ''' - def drop_in_place(self, name: str) -> Series: - ''' - Drop a single column in-place and return the dropped column. - - Parameters - ---------- - name - Name of the column to drop. - - Returns - ------- - Series - The dropped column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.drop_in_place("ham") - shape: (3,) - Series: \'ham\' [str] - [ - "a" - "b" - "c" - ] - - ''' - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: - ''' - Cast DataFrame column(s) to the specified dtype(s). - - Parameters - ---------- - dtypes - Mapping of column names (or selector) to dtypes, or a single dtype - to which all columns will be cast. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], - ... } - ... 
) - - Cast specific frame columns to the specified dtypes: - - >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ u8 ┆ date │ - ╞═════╪═════╪════════════╡ - │ 1.0 ┆ 6 ┆ 2020-01-02 │ - │ 2.0 ┆ 7 ┆ 2021-03-04 │ - │ 3.0 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - Cast all frame columns to the specified dtype: - - >>> df.cast(pl.Utf8).to_dict(as_series=False) - {\'foo\': [\'1\', \'2\', \'3\'], - \'bar\': [\'6.0\', \'7.0\', \'8.0\'], - \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} - - Use selectors to define the columns being cast: - - >>> import polars.selectors as cs - >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ str │ - ╞═════╪═════╪════════════╡ - │ 1 ┆ 6 ┆ 2020-01-02 │ - │ 2 ┆ 7 ┆ 2021-03-04 │ - │ 3 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - ''' - def clear(self, n: int = ...) -> Self: - ''' - Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. - - Returns a `n`-row null-filled DataFrame with an identical schema. - `n` can be greater than the current number of rows in the DataFrame. - - Parameters - ---------- - n - Number of (null-filled) rows to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> df.clear() - shape: (0, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪═════╪══════╡ - └─────┴─────┴──────┘ - - >>> df.clear(n=2) - shape: (2, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪══════╪══════╡ - │ null ┆ null ┆ null │ - │ null ┆ null ┆ null │ - └──────┴──────┴──────┘ - - ''' - def clone(self) -> Self: - ''' - Create a copy of this DataFrame. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current DataFrame, with identical - schema but no data. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.clone() - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true │ - │ 2 ┆ 4.0 ┆ true │ - │ 3 ┆ 10.0 ┆ false │ - │ 4 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - ''' - def get_columns(self) -> list[Series]: - ''' - Get the DataFrame as a List of Series. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.get_columns() - [shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 3 - ], shape: (3,) - Series: \'bar\' [i64] - [ - 4 - 5 - 6 - ]] - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.get_columns() - [shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - ], shape: (4,) - Series: \'b\' [f64] - [ - 0.5 - 4.0 - 10.0 - 13.0 - ], shape: (4,) - Series: \'c\' [bool] - [ - true - true - false - true - ]] - - ''' - def get_column(self, name: str) -> Series: - ''' - Get a single column by name. - - Parameters - ---------- - name : str - Name of the column to retrieve. 
- - Returns - ------- - Series - - See Also - -------- - to_series - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.get_column("foo") - shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - matches_supertype - Fill all matching supertype of the fill `value`. - - Returns - ------- - DataFrame - DataFrame with None values replaced by the filling strategy. - - See Also - -------- - fill_nan - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 4], - ... "b": [0.5, 4, None, 13], - ... } - ... ) - >>> df.fill_null(99) - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 99 ┆ 99.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - >>> df.fill_null(strategy="forward") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> df.fill_null(strategy="max") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> df.fill_null(strategy="zero") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 0 ┆ 0.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - ''' - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: - ''' - Fill floating point NaN values by an Expression evaluation. - - Parameters - ---------- - value - Value with which to replace NaN values. - - Returns - ------- - DataFrame - DataFrame with NaN values replaced by the given value. - - Warnings - -------- - Note that floating point NaNs (Not a Number) are not missing values! - To replace missing values, use :func:`fill_null`. - - See Also - -------- - fill_null - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1.5, 2, float("nan"), 4], - ... "b": [0.5, 4, float("nan"), 13], - ... } - ... ) - >>> df.fill_nan(99) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════╡ - │ 1.5 ┆ 0.5 │ - │ 2.0 ┆ 4.0 │ - │ 99.0 ┆ 99.0 │ - │ 4.0 ┆ 13.0 │ - └──────┴──────┘ - - ''' - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: - ''' - Explode the dataframe to long format by exploding the given columns. - - Parameters - ---------- - columns - Column names, expressions, or a selector defining them. The underlying - columns being exploded must be of List or Utf8 datatype. - *more_columns - Additional names of columns to explode, specified as positional arguments. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "letters": ["a", "a", "b", "c"], - ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], - ... } - ... 
) - >>> df - shape: (4, 2) - ┌─────────┬───────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════════╪═══════════╡ - │ a ┆ [1] │ - │ a ┆ [2, 3] │ - │ b ┆ [4, 5] │ - │ c ┆ [6, 7, 8] │ - └─────────┴───────────┘ - >>> df.explode("numbers") - shape: (8, 2) - ┌─────────┬─────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════════╪═════════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ a ┆ 3 │ - │ b ┆ 4 │ - │ b ┆ 5 │ - │ c ┆ 6 │ - │ c ┆ 7 │ - │ c ┆ 8 │ - └─────────┴─────────┘ - - ''' - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: - ''' - Create a spreadsheet-style pivot table as a DataFrame. - - Only available in eager mode. See "Examples" section below for how to do a - "lazy pivot" if you know the unique column values in advance. - - Parameters - ---------- - values - Column values to aggregate. Can be multiple columns if the *columns* - arguments contains multiple columns as well. - index - One or multiple keys to group by. - columns - Name of the column(s) whose values will be used as the header of the output - DataFrame. - aggregate_function - Choose from: - - - None: no aggregation takes place, will raise error if multiple values are in group. - - A predefined aggregate function string, one of - {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} - - An expression to do the aggregation. - - maintain_order - Sort the grouped keys so that the output order is predictable. - sort_columns - Sort the transposed columns by name. Default is by order of discovery. - separator - Used as separator/delimiter in generated column names. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": ["one", "one", "two", "two", "one", "two"], - ... "bar": ["y", "y", "y", "x", "x", "x"], - ... "baz": [1, 2, 3, 4, 5, 6], - ... } - ... ) - >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ y ┆ x │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ one ┆ 3 ┆ 5 │ - │ two ┆ 3 ┆ 10 │ - └─────┴─────┴─────┘ - - Pivot using selectors to determine the index/values/columns: - - >>> import polars.selectors as cs - >>> df.pivot( - ... values=cs.numeric(), - ... index=cs.string(), - ... columns=cs.string(), - ... aggregate_function="sum", - ... sort_columns=True, - ... ).sort( - ... by=cs.string(), - ... ) - shape: (4, 6) - ┌─────┬─────┬──────┬──────┬──────┬──────┐ - │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪══════╪══════╪══════╪══════╡ - │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ - │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ - │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ - │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ - └─────┴─────┴──────┴──────┴──────┴──────┘ - - Run an expression as aggregation function - - >>> df = pl.DataFrame( - ... { - ... "col1": ["a", "a", "a", "b", "b", "b"], - ... "col2": ["x", "x", "x", "x", "y", "y"], - ... "col3": [6, 7, 3, 2, 5, 7], - ... } - ... ) - >>> df.pivot( - ... index="col1", - ... columns="col2", - ... values="col3", - ... aggregate_function=pl.element().tanh().mean(), - ... 
) - shape: (2, 3) - ┌──────┬──────────┬──────────┐ - │ col1 ┆ x ┆ y │ - │ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 │ - ╞══════╪══════════╪══════════╡ - │ a ┆ 0.998347 ┆ null │ - │ b ┆ 0.964028 ┆ 0.999954 │ - └──────┴──────────┴──────────┘ - - Note that `pivot` is only available in eager mode. If you know the unique - column values in advance, you can use :meth:`polars.LazyFrame.groupby` to - get the same result as above in lazy mode: - - >>> index = pl.col("col1") - >>> columns = pl.col("col2") - >>> values = pl.col("col3") - >>> unique_column_values = ["x", "y"] - >>> aggregate_function = lambda col: col.tanh().mean() - >>> ( - ... df.lazy() - ... .group_by(index) - ... .agg( - ... *[ - ... aggregate_function(values.filter(columns == value)).alias(value) - ... for value in unique_column_values - ... ] - ... ) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌──────┬──────────┬──────────┐ - │ col1 ┆ x ┆ y │ - │ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 │ - ╞══════╪══════════╪══════════╡ - │ a ┆ 0.998347 ┆ null │ - │ b ┆ 0.964028 ┆ 0.999954 │ - └──────┴──────────┴──────────┘ - - ''' - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: - ''' - Unpivot a DataFrame from wide to long format. - - Optionally leaves identifiers set. - - This function is useful to massage a DataFrame into a format where one or more - columns are identifier variables (id_vars) while all other columns, considered - measured variables (value_vars), are "unpivoted" to the row axis leaving just - two non-identifier columns, \'variable\' and \'value\'. - - Parameters - ---------- - id_vars - Column(s) or selector(s) to use as identifier variables. - value_vars - Column(s) or selector(s) to use as values variables; if `value_vars` - is empty all columns that are not in `id_vars` will be used. - variable_name - Name to give to the `variable` column. Defaults to "variable" - value_name - Name to give to the `value` column. Defaults to "value" - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> import polars.selectors as cs - >>> df.melt(id_vars="a", value_vars=cs.numeric()) - shape: (6, 3) - ┌─────┬──────────┬───────┐ - │ a ┆ variable ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 │ - ╞═════╪══════════╪═══════╡ - │ x ┆ b ┆ 1 │ - │ y ┆ b ┆ 3 │ - │ z ┆ b ┆ 5 │ - │ x ┆ c ┆ 2 │ - │ y ┆ c ┆ 4 │ - │ z ┆ c ┆ 6 │ - └─────┴──────────┴───────┘ - - ''' - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: - ''' - Unstack a long table to a wide form without doing an aggregation. - - This can be much faster than a pivot, because it can skip the grouping phase. - - Warnings - -------- - This functionality is experimental and may be subject to changes - without it being considered a breaking change. - - Parameters - ---------- - step - Number of rows in the unstacked frame. - how : { \'vertical\', \'horizontal\' } - Direction of the unstack. - columns - Column name(s) or selector(s) to include in the operation. - If set to `None` (default), use all columns. - fill_values - Fill values that don\'t fit the new size with this value. 
- - Examples - -------- - >>> from string import ascii_uppercase - >>> df = pl.DataFrame( - ... { - ... "x": list(ascii_uppercase[0:8]), - ... "y": pl.int_range(1, 9, eager=True), - ... } - ... ).with_columns( - ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), - ... ) - >>> df - shape: (8, 3) - ┌─────┬─────┬──────────┐ - │ x ┆ y ┆ z │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ list[u8] │ - ╞═════╪═════╪══════════╡ - │ A ┆ 1 ┆ [1, 2] │ - │ B ┆ 2 ┆ [2, 3] │ - │ C ┆ 3 ┆ [3, 4] │ - │ D ┆ 4 ┆ [4, 5] │ - │ E ┆ 5 ┆ [5, 6] │ - │ F ┆ 6 ┆ [6, 7] │ - │ G ┆ 7 ┆ [7, 8] │ - │ H ┆ 8 ┆ [8, 9] │ - └─────┴─────┴──────────┘ - >>> df.unstack(step=4, how="vertical") - shape: (4, 6) - ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ - │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ - ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ - │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ - │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ - │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ - │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ - └─────┴─────┴─────┴─────┴──────────┴──────────┘ - >>> df.unstack(step=2, how="horizontal") - shape: (4, 6) - ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ - │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ - ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ - │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ - │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ - │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ - │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ - └─────┴─────┴─────┴─────┴──────────┴──────────┘ - >>> import polars.selectors as cs - >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) - shape: (5, 2) - ┌─────┬─────┐ - │ y_0 ┆ y_1 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - │ 4 ┆ 0 │ - │ 5 ┆ 0 │ - └─────┴─────┘ - - ''' - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: - ''' - Group by the given columns and return the groups as separate dataframes. - - Parameters - ---------- - by - Column name(s) or selector(s) to group by. - *more_by - Additional names of columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default partition by operation. - include_key - Include the columns used to partition the DataFrame in the output. - as_dict - Return a dictionary instead of a list. The dictionary keys are the distinct - group values that identify that group. - - Examples - -------- - Pass a single column name to partition by that column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... 
) - >>> df.partition_by("a") # doctest: +IGNORE_RESULT - [shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘] - - Partition by multiple columns by either passing a list of column names, or by - specifying each column name as a positional argument. - - >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT - [shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘] - - Return the partitions as a dictionary by specifying `as_dict=True`. - - >>> import polars.selectors as cs - >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT - {\'a\': shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - \'b\': shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - \'c\': shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘} - - ''' - def shift(self, n: int = ...) -> DataFrame: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... ) - >>> df.shift() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ null ┆ null │ - │ 1 ┆ 5 │ - │ 2 ┆ 6 │ - │ 3 ┆ 7 │ - └──────┴──────┘ - - Pass a negative value to shift in the opposite direction instead. - - >>> df.shift(-2) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ null ┆ null │ - │ null ┆ null │ - └──────┴──────┘ - - Specify `fill_value` to fill the resulting null values. 
- - >>> df.shift(-2, fill_value=100) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ 100 ┆ 100 │ - │ 100 ┆ 100 │ - └─────┴─────┘ - - ''' - def is_duplicated(self) -> Series: - ''' - Get a mask of all duplicated rows in this DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) - >>> df.is_duplicated() - shape: (4,) - Series: \'\' [bool] - [ - true - false - false - true - ] - - This mask can be used to visualize the duplicated lines like this: - - >>> df.filter(df.is_duplicated()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ str │ - ╞═════╪═════╡ - │ 1 ┆ x │ - │ 1 ┆ x │ - └─────┴─────┘ - ''' - def is_unique(self) -> Series: - ''' - Get a mask of all unique rows in this DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) - >>> df.is_unique() - shape: (4,) - Series: \'\' [bool] - [ - false - true - true - false - ] - - This mask can be used to visualize the unique lines like this: - - >>> df.filter(df.is_unique()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ str │ - ╞═════╪═════╡ - │ 2 ┆ y │ - │ 3 ┆ z │ - └─────┴─────┘ - ''' - def lazy(self) -> LazyFrame: - ''' - Start a lazy query from this point. This returns a `LazyFrame` object. - - Operations on a `LazyFrame` are not executed until this is requested by either - calling: - - * :meth:`.fetch() ` - (run on a small number of rows) - * :meth:`.collect() ` - (run on all data) - * :meth:`.describe_plan() ` - (print unoptimized query plan) - * :meth:`.describe_optimized_plan() ` - (print optimized query plan) - * :meth:`.show_graph() ` - (show (un)optimized query plan as graphviz graph) - - Lazy operations are advised because they allow for query optimization and more - parallelization. - - Returns - ------- - LazyFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> df.lazy() # doctest: +ELLIPSIS - - - ''' - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - ''' - Select columns from this DataFrame. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Examples - -------- - Pass the name of a column to select that column. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.select("foo") - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Multiple columns can be selected by passing a list of column names. - - >>> df.select(["foo", "bar"]) - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - └─────┴─────┘ - - Multiple columns can also be selected using positional arguments instead of a - list. Expressions are also accepted. 
- - >>> df.select(pl.col("foo"), pl.col("bar") + 1) - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - └─────┴─────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) - shape: (3, 1) - ┌───────────┐ - │ threshold │ - │ --- │ - │ i32 │ - ╞═══════════╡ - │ 0 │ - │ 0 │ - │ 10 │ - └───────────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... df.select( - ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), - ... ) - ... - shape: (3, 1) - ┌───────────┐ - │ is_odd │ - │ --- │ - │ struct[2] │ - ╞═══════════╡ - │ {1,0} │ - │ {0,1} │ - │ {1,0} │ - └───────────┘ - - ''' - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - """ - Select columns from this LazyFrame. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - See Also - -------- - select - - """ - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - ''' - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - DataFrame - A new DataFrame with the columns added. - - Notes - ----- - Creating a new DataFrame using this method does not create a new copy of - existing data. - - Examples - -------- - Pass an expression to add it as a new column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ a^2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ - └─────┴──────┴───────┴──────┘ - - Added columns will replace existing columns with the same name. - - >>> df.with_columns(pl.col("a").cast(pl.Float64)) - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1.0 ┆ 0.5 ┆ true │ - │ 2.0 ┆ 4.0 ┆ true │ - │ 3.0 ┆ 10.0 ┆ false │ - │ 4.0 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - Multiple columns can be added by passing a list of expressions. - - >>> df.with_columns( - ... [ - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ] - ... 
) - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Multiple columns also can be added using positional arguments instead of a list. - - >>> df.with_columns( - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ) - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> df.with_columns( - ... ab=pl.col("a") * pl.col("b"), - ... not_c=pl.col("c").not_(), - ... ) - shape: (4, 5) - ┌─────┬──────┬───────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ ab ┆ not_c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ - └─────┴──────┴───────┴──────┴───────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... df.drop("c").with_columns( - ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), - ... ) - ... - shape: (4, 3) - ┌─────┬──────┬─────────────┐ - │ a ┆ b ┆ diffs │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ struct[2] │ - ╞═════╪══════╪═════════════╡ - │ 1 ┆ 0.5 ┆ {null,null} │ - │ 2 ┆ 4.0 ┆ {1,3.5} │ - │ 3 ┆ 10.0 ┆ {1,6.0} │ - │ 4 ┆ 13.0 ┆ {1,3.0} │ - └─────┴──────┴─────────────┘ - - ''' - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - """ - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - See Also - -------- - with_columns - - """ - def n_chunks(self, strategy: str = ...) -> int | list[int]: - ''' - Get number of chunks used by the ChunkedArrays of this DataFrame. - - Parameters - ---------- - strategy : {\'first\', \'all\'} - Return the number of chunks of the \'first\' column, - or \'all\' columns in this DataFrame. - - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... 
) - >>> df.n_chunks() - 1 - >>> df.n_chunks(strategy="all") - [1, 1, 1] - - ''' - def max(self, axis: int | None = ...) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their maximum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`max_horizontal`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.max() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def max_horizontal(self) -> Series: - ''' - Get the maximum value horizontally across columns. - - Returns - ------- - Series - A Series named `"max"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.max_horizontal() - shape: (3,) - Series: \'max\' [f64] - [ - 4.0 - 5.0 - 6.0 - ] - ''' - def min(self, axis: int | None = ...) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their minimum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`min_horizontal`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.min() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - ''' - def min_horizontal(self) -> Series: - ''' - Get the minimum value horizontally across columns. - - Returns - ------- - Series - A Series named `"min"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.min_horizontal() - shape: (3,) - Series: \'min\' [f64] - [ - 1.0 - 2.0 - 3.0 - ] - ''' - def sum(self) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their sum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`sum_horizontal`. - null_strategy : {\'ignore\', \'propagate\'} - This argument is only used if `axis == 1`. - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.sum() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 6 ┆ 21 ┆ null │ - └─────┴─────┴──────┘ - ''' - def sum_horizontal(self) -> Series: - ''' - Sum all values horizontally across columns. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - If set to `False`, any null value in the input will lead to a null output. 
- - Returns - ------- - Series - A Series named `"sum"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.sum_horizontal() - shape: (3,) - Series: \'sum\' [f64] - [ - 5.0 - 7.0 - 9.0 - ] - ''' - def mean(self) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their mean value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`mean_horizontal`. - null_strategy : {\'ignore\', \'propagate\'} - This argument is only used if `axis == 1`. - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... "spam": [True, False, None], - ... } - ... ) - >>> df.mean() - shape: (1, 4) - ┌─────┬─────┬──────┬──────┐ - │ foo ┆ bar ┆ ham ┆ spam │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str ┆ f64 │ - ╞═════╪═════╪══════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ - └─────┴─────┴──────┴──────┘ - ''' - def mean_horizontal(self) -> Series: - ''' - Take the mean of all values horizontally across columns. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - If set to `False`, any null value in the input will lead to a null output. - - Returns - ------- - Series - A Series named `"mean"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.mean_horizontal() - shape: (3,) - Series: \'mean\' [f64] - [ - 2.5 - 3.5 - 4.5 - ] - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their standard deviation value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.std() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1.0 ┆ 1.0 ┆ null │ - └─────┴─────┴──────┘ - >>> df.std(ddof=0) - shape: (1, 3) - ┌──────────┬──────────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞══════════╪══════════╪══════╡ - │ 0.816497 ┆ 0.816497 ┆ null │ - └──────────┴──────────┴──────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their variance value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.var() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1.0 ┆ 1.0 ┆ null │ - └─────┴─────┴──────┘ - >>> df.var(ddof=0) - shape: (1, 3) - ┌──────────┬──────────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞══════════╪══════════╪══════╡ - │ 0.666667 ┆ 0.666667 ┆ null │ - └──────────┴──────────┴──────┘ - - ''' - def median(self) -> Self: - ''' - Aggregate the columns of this DataFrame to their median value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.median() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null │ - └─────┴─────┴──────┘ - - ''' - def product(self) -> DataFrame: - ''' - Aggregate the columns of this DataFrame to their product values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [0.5, 4, 10], - ... "c": [True, True, False], - ... } - ... ) - - >>> df.product() - shape: (1, 3) - ┌─────┬──────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ i64 │ - ╞═════╪══════╪═════╡ - │ 6 ┆ 20.0 ┆ 0 │ - └─────┴──────┴─────┘ - - ''' - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.quantile(0.5, "nearest") - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null │ - └─────┴─────┴──────┘ - - ''' - def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Convert categorical variables into dummy/indicator variables. - - Parameters - ---------- - columns - Column name(s) or selector(s) that should be converted to dummy - variables. If set to `None` (default), convert all columns. - separator - Separator/delimiter used when generating column names. - drop_first - Remove the first category from the variables being encoded. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2], - ... "bar": [3, 4], - ... "ham": ["a", "b"], - ... } - ... 
) - >>> df.to_dummies() - shape: (2, 6) - ┌───────┬───────┬───────┬───────┬───────┬───────┐ - │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ - ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ - │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ - │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ - └───────┴───────┴───────┴───────┴───────┴───────┘ - - >>> df.to_dummies(drop_first=True) - shape: (2, 3) - ┌───────┬───────┬───────┐ - │ foo_2 ┆ bar_4 ┆ ham_b │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 │ - ╞═══════╪═══════╪═══════╡ - │ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1 ┆ 1 │ - └───────┴───────┴───────┘ - - >>> import polars.selectors as cs - >>> df.to_dummies(cs.integer(), separator=":") - shape: (2, 5) - ┌───────┬───────┬───────┬───────┬─────┐ - │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ - ╞═══════╪═══════╪═══════╪═══════╪═════╡ - │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ - │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ - └───────┴───────┴───────┴───────┴─────┘ - - >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") - shape: (2, 3) - ┌───────┬───────┬─────┐ - │ foo:2 ┆ bar:4 ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ str │ - ╞═══════╪═══════╪═════╡ - │ 0 ┆ 0 ┆ a │ - │ 1 ┆ 1 ┆ b │ - └───────┴───────┴─────┘ - - ''' - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: - ''' - Drop duplicate rows from this dataframe. - - Parameters - ---------- - subset - Column name(s) or selector(s), to consider when identifying - duplicate rows. If set to `None` (default), use all columns. - keep : {\'first\', \'last\', \'any\', \'none\'} - Which of the duplicate rows to keep. - - * \'any\': Does not give any guarantee of which row is kept. - This allows more optimizations. - * \'none\': Don\'t keep duplicate rows. - * \'first\': Keep first unique row. - * \'last\': Keep last unique row. - maintain_order - Keep the same order as the original DataFrame. This is more expensive to - compute. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - Returns - ------- - DataFrame - DataFrame with unique rows. - - Warnings - -------- - This method will fail if there is a column of type `List` in the DataFrame or - subset. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 1], - ... "bar": ["a", "a", "a", "a"], - ... "ham": ["b", "b", "b", "b"], - ... } - ... ) - >>> df.unique(maintain_order=True) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> df.unique(subset=["bar", "ham"], maintain_order=True) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> df.unique(keep="last", maintain_order=True) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - - ''' - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: - ''' - Return the number of unique rows, or the number of unique row-subsets. - - Parameters - ---------- - subset - One or more columns/expressions that define what to count; - omit to return the count of unique rows. 
- - Notes - ----- - This method operates at the `DataFrame` level; to operate on subsets at the - expression level you can make use of struct-packing instead, for example: - - >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() - - If instead you want to count the number of unique values per-column, you can - also use expression-level syntax to return a new frame containing that result: - - >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) - >>> df_nunique = df.select(pl.all().n_unique()) - - In aggregate context there is also an equivalent method for returning the - unique values per-group: - - >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 1, 2, 3, 4, 5], - ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], - ... "c": [True, True, True, False, True, True], - ... } - ... ) - >>> df.n_unique() - 5 - - Simple columns subset. - - >>> df.n_unique(subset=["b", "c"]) - 4 - - Expression subset. - - >>> df.n_unique( - ... subset=[ - ... (pl.col("a") // 2), - ... (pl.col("c") | (pl.col("b") >= 2)), - ... ], - ... ) - 3 - - ''' - def approx_n_unique(self) -> DataFrame: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> df.approx_n_unique() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_unique(self) -> DataFrame: - """ - Approximate count of unique values. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`DataFrame.approx_n_unique`. - - """ - def rechunk(self) -> Self: - """ - Rechunk the data in this DataFrame to a contiguous allocation. - - This will make sure all subsequent operations have optimal and predictable - performance. - """ - def null_count(self) -> Self: - ''' - Create a new DataFrame that shows the null counts per column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 3], - ... "bar": [6, 7, None], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.null_count() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 1 ┆ 0 │ - └─────┴─────┴─────┘ - - ''' - def sample(self, n: int | Series | None = ...) -> Self: - ''' - Sample from this DataFrame. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - If set to True, the order of the sampled rows will be shuffled. If - set to False (default), the order of the returned rows will be - neither stable nor fully random. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: - ''' - Apply a horizontal reduction on a DataFrame. - - This can be used to effectively determine aggregations on a row level, and can - be applied to any DataType that can be supercasted (casted to a similar parent - type). - - An example of the supercast rules when applying an arithmetic operation on two - DataTypes are for instance: - - - Int8 + Utf8 = Utf8 - - Float32 + Int64 = Float32 - - Float32 + Float64 = Float64 - - Examples - -------- - A horizontal sum operation: - - >>> df = pl.DataFrame( - ... { - ... "a": [2, 1, 3], - ... "b": [1, 2, 3], - ... "c": [1.0, 2.0, 3.0], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 + s2) - shape: (3,) - Series: \'a\' [f64] - [ - 4.0 - 5.0 - 9.0 - ] - - A horizontal minimum operation: - - >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) - >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 1.0 - 3.0 - ] - - A horizontal string concatenation: - - >>> df = pl.DataFrame( - ... { - ... "a": ["foo", "bar", 2], - ... "b": [1, 2, 3], - ... "c": [1.0, 2.0, 3.0], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 + s2) - shape: (3,) - Series: \'a\' [str] - [ - "foo11.0" - "bar22.0" - null - ] - - A horizontal boolean or, similar to a row-wise .any(): - - >>> df = pl.DataFrame( - ... { - ... "a": [False, False, True], - ... "b": [False, True, False], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 | s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - Parameters - ---------- - operation - function that takes two `Series` and returns a `Series`. - - ''' - def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: - ''' - Get the values of a single row, either by index or by predicate. - - Parameters - ---------- - index - Row index. - by_predicate - Select the row according to a given expression/predicate. - named - Return a dictionary instead of a tuple. The dictionary is a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - - Returns - ------- - tuple (default) or dictionary of row values - - Notes - ----- - The `index` and `by_predicate` params are mutually exclusive. Additionally, - to ensure clarity, the `by_predicate` parameter must be supplied by keyword. - - When using `by_predicate` it is an error condition if anything other than - one row is returned; more than one row raises `TooManyRowsReturnedError`, and - zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). - - Warnings - -------- - You should NEVER use this method to iterate over a DataFrame; if you require - row-iteration you should strongly prefer use of `iter_rows()` instead. - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - rows : Materialise all frame data as a list of rows (potentially expensive). - item: Return dataframe element as a scalar. - - Examples - -------- - Specify an index to return the row at the given index as a tuple. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.row(2) - (3, 8, \'c\') - - Specify `named=True` to get a dictionary instead with a mapping of column - names to row values. - - >>> df.row(2, named=True) - {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} - - Use `by_predicate` to return the row that matches the given predicate. - - >>> df.row(by_predicate=(pl.col("ham") == "b")) - (2, 7, \'b\') - - ''' - def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: - ''' - Returns all data in the DataFrame as a list of rows of python-native values. - - Parameters - ---------- - named - Return dictionaries instead of tuples. The dictionaries are a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Warnings - -------- - Row-iteration is not optimal as the underlying data is stored in columnar form; - where possible, prefer export via one of the dedicated export/output methods. - Where possible you should also consider using `iter_rows` instead to avoid - materialising all the data at once. - - Returns - ------- - list of tuples (default) or dictionaries of row values - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - rows_by_key : Materialises frame data as a key-indexed dictionary. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "x": ["a", "b", "b", "a"], - ... "y": [1, 2, 3, 4], - ... "z": [0, 3, 6, 9], - ... } - ... ) - >>> df.rows() - [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] - >>> df.rows(named=True) - [{\'x\': \'a\', \'y\': 1, \'z\': 0}, - {\'x\': \'b\', \'y\': 2, \'z\': 3}, - {\'x\': \'b\', \'y\': 3, \'z\': 6}, - {\'x\': \'a\', \'y\': 4, \'z\': 9}] - - ''' - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: - ''' - Returns DataFrame data as a keyed dictionary of python-native values. - - Note that this method should not be used in place of native operations, due to - the high cost of materialising all frame data out into a dictionary; it should - be used only when you need to move the values out into a Python data structure - or other object that cannot operate directly with Polars/Arrow. - - Parameters - ---------- - key - The column(s) to use as the key for the returned dictionary. If multiple - columns are specified, the key will be a tuple of those values, otherwise - it will be a string. - named - Return dictionary rows instead of tuples, mapping column name to row value. - include_key - Include key values inline with the associated data (by default the key - values are omitted as a memory/performance optimisation, as they can be - reoconstructed from the key). - unique - Indicate that the key is unique; this will result in a 1:1 mapping from - key to a single associated row. Note that if the key is *not* actually - unique the last row with the given key will be returned. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. 
If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - See Also - -------- - rows : Materialise all frame data as a list of rows (potentially expensive). - iter_rows : Row iterator over frame data (does not materialise all rows). - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "w": ["a", "b", "b", "a"], - ... "x": ["q", "q", "q", "k"], - ... "y": [1.0, 2.5, 3.0, 4.5], - ... "z": [9, 8, 7, 6], - ... } - ... ) - - Group rows by the given key column(s): - - >>> df.rows_by_key(key=["w"]) - defaultdict(, - {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], - \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) - - Return the same row groupings as dictionaries: - - >>> df.rows_by_key(key=["w"], named=True) - defaultdict(, - {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, - {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], - \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, - {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) - - Return row groupings, assuming keys are unique: - - >>> df.rows_by_key(key=["z"], unique=True) - {9: (\'a\', \'q\', 1.0), - 8: (\'b\', \'q\', 2.5), - 7: (\'b\', \'q\', 3.0), - 6: (\'a\', \'k\', 4.5)} - - Return row groupings as dictionaries, assuming keys are unique: - - >>> df.rows_by_key(key=["z"], named=True, unique=True) - {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, - 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, - 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, - 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} - - Return dictionary rows grouped by a compound key, including key values: - - >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) - defaultdict(, - {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], - (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, - {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], - (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) - - ''' - def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: - ''' - Returns an iterator over the DataFrame of rows of python-native values. - - Parameters - ---------- - named - Return dictionaries instead of tuples. The dictionaries are a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - buffer_size - Determines the number of rows that are buffered internally while iterating - over the data; you should only modify this in very specific cases where the - default value is determined not to be a good fit to your access pattern, as - the speedup from using the buffer is significant (~2-4x). Setting this - value to zero disables row buffering (not recommended). - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Warnings - -------- - Row iteration is not optimal as the underlying data is stored in columnar form; - where possible, prefer export via one of the dedicated export/output methods - that deals with columnar data. - - Returns - ------- - iterator of tuples (default) or dictionaries (if named) of python row values - - See Also - -------- - rows : Materialises all frame data as a list of rows (potentially expensive). - rows_by_key : Materialises frame data as a key-indexed dictionary. 
- - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> [row[0] for row in df.iter_rows()] - [1, 3, 5] - >>> [row["b"] for row in df.iter_rows(named=True)] - [2, 4, 6] - - ''' - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: - ''' - Returns a non-copying iterator of slices over the underlying DataFrame. - - Parameters - ---------- - n_rows - Determines the number of rows contained in each DataFrame slice. - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... data={ - ... "a": range(17_500), - ... "b": date(2023, 1, 1), - ... "c": "klmnoopqrstuvwxyz", - ... }, - ... schema_overrides={"a": pl.Int32}, - ... ) - >>> for idx, frame in enumerate(df.iter_slices()): - ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") - ... - DataFrame:[0]:10000 - DataFrame:[1]:7500 - - Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and - any supported frame export/conversion types; for example, as RecordBatches: - - >>> for frame in df.iter_slices(n_rows=15_000): - ... record_batch = frame.to_arrow().to_batches()[0] - ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") - ... - a: int32 - b: date32[day] - c: large_string - << 15000 - a: int32 - b: date32[day] - c: large_string - << 2500 - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - partition_by : Split into multiple DataFrames, partitioned by groups. - - ''' - def shrink_to_fit(self) -> Self: - """ - Shrink DataFrame memory usage. - - Shrinks to fit the exact capacity needed to hold the data. - - """ - def gather_every(self, n: int) -> DataFrame: - ''' - Take every nth row in the DataFrame and return as a new DataFrame. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - >>> s.gather_every(2) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 5 │ - │ 3 ┆ 7 │ - └─────┴─────┘ - - ''' - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: - ''' - Hash and combine the rows in this DataFrame. - - The hash value is of type `UInt64`. - - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. - - Notes - ----- - This implementation of :func:`hash_rows` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 3, 4], - ... "ham": ["a", "b", None, "d"], - ... } - ... ) - >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT - shape: (4,) - Series: \'\' [u64] - [ - 10783150408545073287 - 1438741209321515184 - 10047419486152048166 - 2047317070637311557 - ] - - ''' - def interpolate(self) -> DataFrame: - ''' - Interpolate intermediate values. The interpolation method is linear. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 9, 10], - ... "bar": [6, 7, 9, None], - ... "baz": [1, None, None, 9], - ... } - ... 
) - >>> df.interpolate() - shape: (4, 3) - ┌──────┬──────┬──────────┐ - │ foo ┆ bar ┆ baz │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════════╡ - │ 1.0 ┆ 6.0 ┆ 1.0 │ - │ 5.0 ┆ 7.0 ┆ 3.666667 │ - │ 9.0 ┆ 9.0 ┆ 6.333333 │ - │ 10.0 ┆ null ┆ 9.0 │ - └──────┴──────┴──────────┘ - - ''' - def is_empty(self) -> bool: - ''' - Check if the dataframe is empty. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.is_empty() - False - >>> df.filter(pl.col("foo") > 99).is_empty() - True - - ''' - def to_struct(self, name: str) -> Series: - ''' - Convert a `DataFrame` to a `Series` of type `Struct`. - - Parameters - ---------- - name - Name for the struct Series - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4, 5], - ... "b": ["one", "two", "three", "four", "five"], - ... } - ... ) - >>> df.to_struct("nums") - shape: (5,) - Series: \'nums\' [struct[2]] - [ - {1,"one"} - {2,"two"} - {3,"three"} - {4,"four"} - {5,"five"} - ] - - ''' - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Decompose struct columns into separate columns for each of their fields. - - The new columns will be inserted into the dataframe at the location of the - struct column. - - Parameters - ---------- - columns - Name of the struct column(s) that should be unnested. - *more_columns - Additional columns to unnest, specified as positional arguments. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "before": ["foo", "bar"], - ... "t_a": [1, 2], - ... "t_b": ["a", "b"], - ... "t_c": [True, None], - ... "t_d": [[1, 2], [3]], - ... "after": ["baz", "womp"], - ... } - ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") - >>> df - shape: (2, 3) - ┌────────┬─────────────────────┬───────┐ - │ before ┆ t_struct ┆ after │ - │ --- ┆ --- ┆ --- │ - │ str ┆ struct[4] ┆ str │ - ╞════════╪═════════════════════╪═══════╡ - │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ - │ bar ┆ {2,"b",null,[3]} ┆ womp │ - └────────┴─────────────────────┴───────┘ - >>> df.unnest("t_struct") - shape: (2, 6) - ┌────────┬─────┬─────┬──────┬───────────┬───────┐ - │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ - ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ - │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ - │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ - └────────┴─────┴─────┴──────┴───────────┴───────┘ - - ''' - def corr(self, **kwargs: Any) -> DataFrame: - ''' - Return pairwise Pearson product-moment correlation coefficients between columns. - - See numpy `corrcoef` for more information: - https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html - - Notes - ----- - This functionality requires numpy to be installed. - - Parameters - ---------- - **kwargs - Keyword arguments are passed to numpy `corrcoef`. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) - >>> df.corr() - shape: (3, 3) - ┌──────┬──────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════╡ - │ 1.0 ┆ -1.0 ┆ 1.0 │ - │ -1.0 ┆ 1.0 ┆ -1.0 │ - │ 1.0 ┆ -1.0 ┆ 1.0 │ - └──────┴──────┴──────┘ - - ''' - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: - ''' - Take two sorted DataFrames and merge them by the sorted key. - - The output of this operation will also be sorted. 
- It is the callers responsibility that the frames are sorted - by that key otherwise the output will not make sense. - - The schemas of both DataFrames must be equal. - - Parameters - ---------- - other - Other DataFrame that must be merged - key - Key that is sorted. - - Examples - -------- - >>> df0 = pl.DataFrame( - ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} - ... ).sort("age") - >>> df0 - shape: (3, 2) - ┌───────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═════╡ - │ bob ┆ 18 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └───────┴─────┘ - >>> df1 = pl.DataFrame( - ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} - ... ).sort("age") - >>> df1 - shape: (4, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - └────────┴─────┘ - >>> df0.merge_sorted(df1, key="age") - shape: (7, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ bob ┆ 18 │ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └────────┴─────┘ - ''' - def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: - """ - Indicate that one or multiple columns are sorted. - - Parameters - ---------- - column - Columns that are sorted - more_columns - Additional columns that are sorted, specified as positional arguments. - descending - Whether the columns are sorted in descending order. - """ - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: - ''' - Update the values in this `DataFrame` with the values in `other`. - - By default, null values in the right dataframe are ignored. Use - `ignore_nulls=False` to overwrite values in this frame with null values in other - frame. - - Notes - ----- - This is syntactic sugar for a left/inner join, with an optional coalesce when - `include_nulls = False`. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Parameters - ---------- - other - DataFrame that will be used to update the values - on - Column names that will be joined on. - If none given the row count is used. - left_on - Join column(s) of the left DataFrame. - right_on - Join column(s) of the right DataFrame. - how : {\'left\', \'inner\', \'outer\'} - * \'left\' will keep all rows from the left table; rows may be duplicated - if multiple rows in the right frame match the left row\'s key. - * \'inner\' keeps only those rows where the key exists in both frames. - * \'outer\' will update existing rows where the key matches while also - adding any new rows contained in the given frame. - include_nulls - If True, null values from the right dataframe will be used to update the - left dataframe. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4], - ... "B": [400, 500, 600, 700], - ... } - ... ) - >>> df - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 400 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - >>> new_df = pl.DataFrame( - ... { - ... "B": [-66, None, -99], - ... "C": [5, 3, 1], - ... } - ... 
) - - Update `df` values with the non-null values in `new_df`, by row index: - - >>> df.update(new_df) - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, by row index, - but only keeping those rows that are common to both frames: - - >>> df.update(new_df, how="inner") - shape: (3, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") - shape: (5, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴─────┘ - - Update `df` values including null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> df.update( - ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ null │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴──────┘ - - ''' - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: - """ - Start a group by operation. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.group_by`. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - .. note:: - Within each group, the order of rows is always preserved, regardless - of this argument. - - Returns - ------- - GroupBy - Object which can be used to perform aggregations. - - """ - def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. 
- Doing so incorrectly will lead to incorrect output - - """ - def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.9 - This method has been renamed to :func:`DataFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - """ - def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.group_by_dynamic`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - include_boundaries - Add the lower and upper bound of the window to the "_lower_bound" and - "_upper_bound" columns. This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. 
- check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - DynamicGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - ''' - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: - """ - Apply a custom/user-defined function (UDF) over the rows of the DataFrame. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.map_rows`. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output type of the operation. If none given, Polars tries to infer the type. - inference_size - Only used in the case when the custom function returns rows. - This uses the first `n` rows to determine the output schema - - """ - def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - fill None values with this value. - n - Number of places to shift (may be negative). - - """ - def take_every(self, n: int) -> DataFrame: - """ - Take every nth row in the DataFrame and return as a new DataFrame. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def find_idx_by_name(self, name: str) -> int: - """ - Find the index of a column by name. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`get_column_index`. - - Parameters - ---------- - name - Name of the column to find. - """ - def insert_at_idx(self, index: int, column: Series) -> Self: - """ - Insert a Series at a certain column index. This operation is in place. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`insert_column`. - - Parameters - ---------- - index - Column to insert the new `Series` column. - column - `Series` to insert. - """ - def replace_at_idx(self, index: int, new_column: Series) -> Self: - """ - Replace a column at an index location. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`replace_column`. - - Parameters - ---------- - index - Column index. - new_column - Series that will replace the column. - """ - def frame_equal(self, other: DataFrame) -> bool: - """ - Check whether the DataFrame is equal to another DataFrame. - - .. deprecated:: 0.19.16 - This method has been renamed to :func:`equals`. - - Parameters - ---------- - other - DataFrame to compare with. - null_equal - Consider null values as equal. - """ - @property - def shape(self): ... - @property - def height(self): ... - @property - def width(self): ... - @property - def dtypes(self): ... - @property - def flags(self): ... - @property - def schema(self): ... -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
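Each regenerated stub file added below starts with a `#: version <polars version>` comment as its first line (the new 0.19.19 frame.pyi header is visible in the next hunk). As a minimal sketch of how that header could be read back and compared against a polars release, assuming the `_stubs/<version>/...` layout shown in this diff; the helper name `stub_version` is hypothetical and not part of the package:

# Sketch only: parse the "#: version X.Y.Z" header that the generated stubs
# begin with, using packaging.version for comparisons. The path mirrors the
# _stubs/<version>/... layout in this diff; `stub_version` is illustrative.
from pathlib import Path
from packaging.version import Version

def stub_version(stub_path: Path) -> Version:
    header = stub_path.read_text().splitlines()[0]
    if not header.startswith("#: version "):
        raise ValueError(f"{stub_path} has no version header")
    return Version(header.removeprefix("#: version "))

# Example usage against the stub added in this diff:
frame_stub = Path(
    "polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/dataframe/frame.pyi"
)
assert stub_version(frame_stub) == Version("0.19.19")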
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/dataframe/frame.pyi new file mode 100644 index 0000000..9c10836 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/dataframe/frame.pyi @@ -0,0 +1,7036 @@ +#: version 0.19.19 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as 
normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) 
-> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
+ schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use `pl.read_csv` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use `pl.read_parquet` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading `n_rows`. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use `pl.read_json` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use `pl.read_ndjson` to dispatch to this method. 
+ + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with `NaN`. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to `True` will raise a `NotImplementedError`. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars DataFrame to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (<DtypeKind.FLOAT: 2>, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ...
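The comparison and arithmetic dunder stubs above are annotated to return `DataFrame` (and `__bool__` to return `NoReturn`) because polars applies these operators elementwise and rejects ambiguous truth-value checks. A minimal usage sketch of that behaviour (illustrative only, not part of the generated stub or of this diff):

import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})
other = pl.DataFrame({"foo": [1, 0, 3], "bar": [6, 7, 0]})

mask = df == other  # elementwise comparison: a DataFrame of booleans, not a single bool
total = df + 1      # arithmetic is elementwise as well and returns a new DataFrame
# bool(df)          # raises TypeError, which is why __bool__ is typed as NoReturn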
+ def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the DataFrame as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to `df[0,0]`, with a check that + the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are Series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + `structured` is set to `False` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + `pyarrow`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. 
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. 
+ + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path or writeable file-like object to which the data will be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + name + Schema name. Defaults to empty string. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open `xlsxwriter.Workbook` object that has not been closed. + If None, writes to a `dataframe.xlsx` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of `{"key":value,}` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. + column_formats : dict + A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. + dtype_formats : dict + A `{dtype:str,}` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + `column_formats` param). It is also valid to use dtype groups such as + `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid `xlsxwriter` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all `xlsxwriter` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. 
+ header_format : dict + A `{key:value,}` dictionary of `xlsxwriter` format options to apply + to the table header row, such as `{"bold":True, "font_color":"#702963"}`. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a `{colname:funcname,}` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A `{colname:int,}` or `{selector:int,}` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a `{colname:columns,}` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or `{row_index:int,}` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that `row_index` starts at zero and will be + the header row (unless `include_header` is False). + sparklines : dict + A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an `xlsxwriter`-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. 
+ float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + include_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible `xlsxwriter` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic DataFrame: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... 
) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... 
# create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC data will be + written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC record batch data will + be written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. 
+ compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + data_page_size + Size of the data page in bytes. Defaults to 1024^2 bytes. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to `pyarrow.parquet.write_table`. + + If you pass `partition_cols` here, the dataset will be written + using `pyarrow.parquet.write_to_dataset`. + The `partition_cols` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, table_name: str, connection: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Schema-qualified name of the table to create or append to in the target + SQL database. If your table name contains special characters, it should + be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode: + + * \'replace\' will create a new database table, overwriting an existing one. + * \'append\' will append to an existing table. + * \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. 
+ + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. + + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... 
) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_column(self, index: int, column: Series) -> Self: + ''' + Insert a Series at a certain column index. + + This operation is in place. + + Parameters + ---------- + index + Index at which to insert the new `Series` column. + column + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_column(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_column(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... 
) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def get_column_index(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... 
) + >>> df.get_column_index("ham") + 2 + + ''' + def replace_column(self, index: int, column: Series) -> Self: + ''' + Replace a column at an index location. + + This operation is in place. + + Parameters + ---------- + index + Column index. + column + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_column(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. 
+ + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def equals(self, other: DataFrame) -> bool: + ''' + Check whether the DataFrame is equal to another DataFrame. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + See Also + -------- + assert_frame_equal + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.equals(df1) + True + >>> df1.equals(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. 
For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The `GroupBy` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. 
For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. 
+ * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... 
"values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see `pl.StringCache()`. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: `udf(row)`. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level `apply` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level `apply` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
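+
+        As a rough sketch of the `@lru_cache` suggestion above (illustrative only;
+        the `expensive` helper and its inputs are made up for this example):
+
+        >>> from functools import lru_cache
+        >>> @lru_cache(maxsize=None)
+        ... def expensive(x):
+        ...     return x * 10
+        >>> df = pl.DataFrame({"foo": [1, 2, 2, 1], "bar": [3, 4, 4, 3]})
+        >>> _ = df.map_rows(lambda t: expensive(t[0]))
+        >>> expensive.cache_info().misses  # repeated inputs hit the cache
+        2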
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of + this `DataFrame`, `extend` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer `vstack` over `extend` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single `DataFrame`. In the latter case, finish the sequence of + `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
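+
+        A minimal sketch of the in-place behaviour noted above (the frames mirror
+        the ones used in the Examples below):
+
+        >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
+        >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]})
+        >>> _ = df1.extend(df2)
+        >>> df1.height  # df1 itself now holds the extra rows
+        6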
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill `value`. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to `None` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. 
+ + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying `as_dict=True`. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. 
+ + >>> df.shift(-2, fill_value=100) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`max_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def max_horizontal(self) -> Series: + ''' + Get the maximum value horizontally across columns. + + Returns + ------- + Series + A Series named `"max"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.max_horizontal() + shape: (3,) + Series: \'max\' [f64] + [ + 4.0 + 5.0 + 6.0 + ] + ''' + def min(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`min_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def min_horizontal(self) -> Series: + ''' + Get the minimum value horizontally across columns. + + Returns + ------- + Series + A Series named `"min"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.min_horizontal() + shape: (3,) + Series: \'min\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`sum_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + ''' + def sum_horizontal(self) -> Series: + ''' + Sum all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. 
+ + Returns + ------- + Series + A Series named `"sum"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.sum_horizontal() + shape: (3,) + Series: \'sum\' [f64] + [ + 5.0 + 7.0 + 9.0 + ] + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`mean_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + ''' + def mean_horizontal(self) -> Series: + ''' + Take the mean of all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"mean"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.mean_horizontal() + shape: (3,) + Series: \'mean\' [f64] + [ + 2.5 + 3.5 + 4.5 + ] + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to `None` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the `DataFrame` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + - Int8 + Utf8 = Utf8 + - Float32 + Int64 = Float32 + - Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The `index` and `by_predicate` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using `by_predicate` it is an error condition if anything other than + one row is returned; more than one row raises `TooManyRowsReturnedError`, and + zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of `iter_rows()` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify `named=True` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use `by_predicate` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using `iter_rows` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your
+ use-case you should export to a different format (such as Arrow or NumPy).
+
+ See Also
+ --------
+ rows : Materialise all frame data as a list of rows (potentially expensive).
+ iter_rows : Row iterator over frame data (does not materialise all rows).
+
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "w": ["a", "b", "b", "a"],
+ ... "x": ["q", "q", "q", "k"],
+ ... "y": [1.0, 2.5, 3.0, 4.5],
+ ... "z": [9, 8, 7, 6],
+ ... }
+ ... )
+
+ Group rows by the given key column(s):
+
+ >>> df.rows_by_key(key=["w"])
+ defaultdict(<class \'list\'>,
+ {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)],
+ \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]})
+
+ Return the same row groupings as dictionaries:
+
+ >>> df.rows_by_key(key=["w"], named=True)
+ defaultdict(<class \'list\'>,
+ {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9},
+ {\'x\': \'k\', \'y\': 4.5, \'z\': 6}],
+ \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8},
+ {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]})
+
+ Return row groupings, assuming keys are unique:
+
+ >>> df.rows_by_key(key=["z"], unique=True)
+ {9: (\'a\', \'q\', 1.0),
+ 8: (\'b\', \'q\', 2.5),
+ 7: (\'b\', \'q\', 3.0),
+ 6: (\'a\', \'k\', 4.5)}
+
+ Return row groupings as dictionaries, assuming keys are unique:
+
+ >>> df.rows_by_key(key=["z"], named=True, unique=True)
+ {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0},
+ 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5},
+ 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0},
+ 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}}
+
+ Return dictionary rows grouped by a compound key, including key values:
+
+ >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True)
+ defaultdict(<class \'list\'>,
+ {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}],
+ (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8},
+ {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}],
+ (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]})
+
+ '''
+ def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]:
+ '''
+ Returns an iterator over the DataFrame of rows of python-native values.
+
+ Parameters
+ ----------
+ named
+ Return dictionaries instead of tuples. The dictionaries are a mapping of
+ column name to row value. This is more expensive than returning a regular
+ tuple, but allows for accessing values by column name.
+ buffer_size
+ Determines the number of rows that are buffered internally while iterating
+ over the data; you should only modify this in very specific cases where the
+ default value is determined not to be a good fit to your access pattern, as
+ the speedup from using the buffer is significant (~2-4x). Setting this
+ value to zero disables row buffering (not recommended).
+
+ Notes
+ -----
+ If you have `ns`-precision temporal values you should be aware that Python
+ natively only supports up to `μs`-precision; `ns`-precision values will be
+ truncated to microseconds on conversion to Python. If this matters to your
+ use-case you should export to a different format (such as Arrow or NumPy).
+
+ Warnings
+ --------
+ Row iteration is not optimal as the underlying data is stored in columnar form;
+ where possible, prefer export via one of the dedicated export/output methods
+ that deals with columnar data.
+
+ Returns
+ -------
+ iterator of tuples (default) or dictionaries (if named) of python row values
+
+ See Also
+ --------
+ rows : Materialises all frame data as a list of rows (potentially expensive).
+ rows_by_key : Materialises frame data as a key-indexed dictionary.
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_columns(self) -> Iterator[Series]: + ''' + Returns an iterator over the DataFrame\'s columns. + + Notes + ----- + Consider whether you can use :func:`all` instead. + If you can, it will be more efficient. + + Returns + ------- + Iterator of Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [s.name for s in df.iter_columns()] + [\'a\', \'b\'] + + If you\'re using this to modify a dataframe\'s columns, e.g. + + >>> # Do NOT do this + >>> pl.DataFrame(column * 2 for column in df.iter_columns()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + + then consider whether you can use :func:`all` instead: + + >>> df.select(pl.all() * 2) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def gather_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.gather_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. 
Defaults to `seed` if not set. + + Notes + ----- + This implementation of :func:`hash_rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a `DataFrame` to a `Series` of type `Struct`. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. 
+ + See numpy `corrcoef` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy `corrcoef`. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the values in `other`. + + By default, null values in the right dataframe are ignored. Use + `ignore_nulls=False` to overwrite values in this frame with null values in other + frame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. 
+ how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. 
+ This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> DataFrame: + """ + Take every nth row in the DataFrame and return as a new DataFrame. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def find_idx_by_name(self, name: str) -> int: + """ + Find the index of a column by name. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`get_column_index`. + + Parameters + ---------- + name + Name of the column to find. + """ + def insert_at_idx(self, index: int, column: Series) -> Self: + """ + Insert a Series at a certain column index. This operation is in place. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`insert_column`. + + Parameters + ---------- + index + Column to insert the new `Series` column. + column + `Series` to insert. + """ + def replace_at_idx(self, index: int, new_column: Series) -> Self: + """ + Replace a column at an index location. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`replace_column`. + + Parameters + ---------- + index + Column index. + new_column + Series that will replace the column. 
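# Illustrative sketch (editorial aside, not part of the generated stub): the
# deprecation shims documented above all forward to renamed DataFrame methods.
# A minimal example of the current spellings, assuming polars >= 0.19.14 (the
# versions noted in the deprecation messages above):
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]})

df.gather_every(2)                                   # formerly take_every
df.get_column_index("b")                             # formerly find_idx_by_name
df.insert_column(1, pl.Series("c", [0, 0, 0, 0]))    # formerly insert_at_idx
df.replace_column(0, pl.Series("a", [9, 9, 9, 9]))   # formerly replace_at_idx
df.shift(1, fill_value=0)                            # formerly shift_and_fill
df.map_rows(lambda row: row[0] * 2)                  # formerly apply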
+ """ + def frame_equal(self, other: DataFrame) -> bool: + """ + Check whether the DataFrame is equal to another DataFrame. + + .. deprecated:: 0.19.16 + This method has been renamed to :func:`equals`. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/expr/expr deleted file mode 100644 index 5131d44..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/expr/expr +++ /dev/null @@ -1,8289 +0,0 @@ -import P -import np as np -import pl -from builtins import PyExpr -from datetime import timedelta -from polars.datatypes.classes import Categorical as Categorical, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 -from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy -from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import no_default as no_default, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence - -TYPE_CHECKING: bool -py_arg_where: builtin_function_or_method -pyreduce: builtin_function_or_method - -class Expr: - _pyexpr: _ClassVar[None] = ... - _accessors: _ClassVar[set] = ... - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _repr_html_(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int | bool) -> Self: ... 
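# Illustrative sketch (editorial aside, not part of the stub): the operator
# overloads stubbed on Expr are what let Python operators build expressions,
# e.g. `>` resolves to __gt__, `&` to __and__, and `~` to __invert__.
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 2.0, 3.0]})
df.filter((pl.col("a") > 1) & ~pl.col("b").is_null())   # rows where a > 1 and b is not null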
- def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int | bool) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr | int | bool) -> Self: ... - def __rxor__(self, other: Any) -> Self: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: - """Numpy universal functions.""" - @classmethod - def from_json(cls, value: str) -> Self: - """ - Read an expression from a JSON encoded string to construct an Expression. - - Parameters - ---------- - value - JSON encoded string value - - """ - def to_physical(self) -> Self: - ''' - Cast to physical representation of the logical dtype. - - - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` - - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` - - `List(inner)` -> `List(physical of inner)` - - Other data types will be left unchanged. - - Examples - -------- - Replicating the pandas - `pd.factorize - `_ - function. - - >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( - ... [ - ... pl.col("vals").cast(pl.Categorical), - ... pl.col("vals") - ... .cast(pl.Categorical) - ... .to_physical() - ... .alias("vals_physical"), - ... ] - ... ) - shape: (4, 2) - ┌──────┬───────────────┐ - │ vals ┆ vals_physical │ - │ --- ┆ --- │ - │ cat ┆ u32 │ - ╞══════╪═══════════════╡ - │ a ┆ 0 │ - │ x ┆ 1 │ - │ null ┆ null │ - │ a ┆ 0 │ - └──────┴───────────────┘ - - ''' - def any(self) -> Self: - ''' - Return whether any of the values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is null. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, False], - ... "b": [False, False], - ... "c": [None, False], - ... } - ... 
) - >>> df.select(pl.col("*").any()) - shape: (1, 3) - ┌──────┬───────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪═══════╡ - │ true ┆ false ┆ false │ - └──────┴───────┴───────┘ - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> df.select(pl.col("*").any(ignore_nulls=False)) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ null │ - └──────┴───────┴──────┘ - - ''' - def all(self) -> Self: - ''' - Return whether all values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - .. note:: - This method is not to be confused with the function :func:`polars.all`, - which can be used to select all columns. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is null. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, True], - ... "b": [False, True], - ... "c": [None, True], - ... } - ... ) - >>> df.select(pl.col("*").all()) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ true │ - └──────┴───────┴──────┘ - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> df.select(pl.col("*").all(ignore_nulls=False)) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ null │ - └──────┴───────┴──────┘ - - ''' - def arg_true(self) -> Self: - ''' - Return indices where expression evaluates `True`. - - .. warning:: - Modifies number of rows returned, so will fail in combination with other - expressions. Use as only expression in `select` / `with_columns`. - - See Also - -------- - Series.arg_true : Return indices where Series is True - polars.arg_where - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) - >>> df.select((pl.col("a") == 1).arg_true()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - │ 3 │ - └─────┘ - - ''' - def sqrt(self) -> Self: - ''' - Compute the square root of the elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").sqrt()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.414214 │ - │ 2.0 │ - └──────────┘ - - ''' - def cbrt(self) -> Self: - ''' - Compute the cube root of the elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").cbrt()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.259921 │ - │ 1.587401 │ - └──────────┘ - - ''' - def log10(self) -> Self: - ''' - Compute the base 10 logarithm of the input array, element-wise. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").log10()) - shape: (3, 1) - ┌─────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞═════════╡ - │ 0.0 │ - │ 0.30103 │ - │ 0.60206 │ - └─────────┘ - - ''' - def exp(self) -> Self: - ''' - Compute the exponential, element-wise. 
- - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").exp()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 2.718282 │ - │ 7.389056 │ - │ 54.59815 │ - └──────────┘ - - ''' - def alias(self, name: str) -> Self: - ''' - Rename the expression. - - Parameters - ---------- - name - The new name. - - See Also - -------- - map - prefix - suffix - - Examples - -------- - Rename an expression to avoid overwriting an existing column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... ) - >>> df.with_columns( - ... pl.col("a") + 10, - ... pl.col("b").str.to_uppercase().alias("c"), - ... ) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 11 ┆ x ┆ X │ - │ 12 ┆ y ┆ Y │ - │ 13 ┆ z ┆ Z │ - └─────┴─────┴─────┘ - - Overwrite the default name of literal columns to prevent errors due to duplicate - column names. - - >>> df.with_columns( - ... pl.lit(True).alias("c"), - ... pl.lit(4.0).alias("d"), - ... ) - shape: (3, 4) - ┌─────┬─────┬──────┬─────┐ - │ a ┆ b ┆ c ┆ d │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ bool ┆ f64 │ - ╞═════╪═════╪══════╪═════╡ - │ 1 ┆ x ┆ true ┆ 4.0 │ - │ 2 ┆ y ┆ true ┆ 4.0 │ - │ 3 ┆ z ┆ true ┆ 4.0 │ - └─────┴─────┴──────┴─────┘ - - ''' - def map_alias(self, function: Callable[[str], str]) -> Self: - ''' - Rename the output of an expression by mapping a function over the root name. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.map`. - - Parameters - ---------- - function - Function that maps a root name to a new name. - - See Also - -------- - keep_name - prefix - suffix - - Examples - -------- - Remove a common suffix and convert to lower case. - - >>> df = pl.DataFrame( - ... { - ... "A_reverse": [3, 2, 1], - ... "B_reverse": ["z", "y", "x"], - ... } - ... ) - >>> df.with_columns( - ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) - ... ) - shape: (3, 4) - ┌───────────┬───────────┬─────┬─────┐ - │ A_reverse ┆ B_reverse ┆ a ┆ b │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═══════════╪═══════════╪═════╪═════╡ - │ 3 ┆ z ┆ 1 ┆ x │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 1 ┆ x ┆ 3 ┆ z │ - └───────────┴───────────┴─────┴─────┘ - - ''' - def prefix(self, prefix: str) -> Self: - ''' - Add a prefix to the root column name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.prefix`. - - Parameters - ---------- - prefix - Prefix to add to the root column name. - - Notes - ----- - This will undo any previous renaming operations on the expression. - - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - suffix - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... ) - >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) - shape: (3, 4) - ┌─────┬─────┬───────────┬───────────┐ - │ a ┆ b ┆ reverse_a ┆ reverse_b │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪═════╪═══════════╪═══════════╡ - │ 1 ┆ x ┆ 3 ┆ z │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 3 ┆ z ┆ 1 ┆ x │ - └─────┴─────┴───────────┴───────────┘ - - ''' - def suffix(self, suffix: str) -> Self: - ''' - Add a suffix to the root column name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.suffix`. 
- - Parameters - ---------- - suffix - Suffix to add to the root column name. - - Notes - ----- - This will undo any previous renaming operations on the expression. - - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - prefix - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... ) - >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) - shape: (3, 4) - ┌─────┬─────┬───────────┬───────────┐ - │ a ┆ b ┆ a_reverse ┆ b_reverse │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪═════╪═══════════╪═══════════╡ - │ 1 ┆ x ┆ 3 ┆ z │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 3 ┆ z ┆ 1 ┆ x │ - └─────┴─────┴───────────┴───────────┘ - - ''' - def keep_name(self) -> Self: - ''' - Keep the original root name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.keep`. - - Notes - ----- - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - alias - - Examples - -------- - Undo an alias operation. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2], - ... "b": [3, 4], - ... } - ... ) - >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 9 ┆ 3 │ - │ 18 ┆ 4 │ - └─────┴─────┘ - - Prevent errors due to duplicate column names. - - >>> df.select((pl.lit(10) / pl.all()).name.keep()) - shape: (2, 2) - ┌──────┬──────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════════╡ - │ 10.0 ┆ 3.333333 │ - │ 5.0 ┆ 2.5 │ - └──────┴──────────┘ - - ''' - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: - ''' - Exclude columns from a multi-column expression. - - Only works after a wildcard or regex column selection, and you cannot provide - both string column names *and* dtypes (you may prefer to use selectors instead). - - Parameters - ---------- - columns - The name or datatype of the column(s) to exclude. Accepts regular expression - input. Regular expressions should start with `^` and end with `$`. - *more_columns - Additional names or datatypes of columns to exclude, specified as positional - arguments. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "aa": [1, 2, 3], - ... "ba": ["a", "b", None], - ... "cc": [None, 2.5, 1.5], - ... } - ... ) - >>> df - shape: (3, 3) - ┌─────┬──────┬──────┐ - │ aa ┆ ba ┆ cc │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ f64 │ - ╞═════╪══════╪══════╡ - │ 1 ┆ a ┆ null │ - │ 2 ┆ b ┆ 2.5 │ - │ 3 ┆ null ┆ 1.5 │ - └─────┴──────┴──────┘ - - Exclude by column name(s): - - >>> df.select(pl.all().exclude("ba")) - shape: (3, 2) - ┌─────┬──────┐ - │ aa ┆ cc │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ null │ - │ 2 ┆ 2.5 │ - │ 3 ┆ 1.5 │ - └─────┴──────┘ - - Exclude by regex, e.g. removing all columns whose names end with the letter "a": - - >>> df.select(pl.all().exclude("^.*a$")) - shape: (3, 1) - ┌──────┐ - │ cc │ - │ --- │ - │ f64 │ - ╞══════╡ - │ null │ - │ 2.5 │ - │ 1.5 │ - └──────┘ - - Exclude by dtype(s), e.g. 
removing all columns of type Int64 or Float64: - - >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) - shape: (3, 1) - ┌──────┐ - │ ba │ - │ --- │ - │ str │ - ╞══════╡ - │ a │ - │ b │ - │ null │ - └──────┘ - - ''' - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the expression as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Examples - -------- - >>> def extract_number(expr: pl.Expr) -> pl.Expr: - ... """Extract the digits from a string.""" - ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) - >>> - >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: - ... """Set even numbers negative, and scale by a user-supplied value.""" - ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) - ... return expr * n - >>> - >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) - >>> df.with_columns( - ... udfs=( - ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) - ... ), - ... ) - shape: (4, 2) - ┌──────┬──────┐ - │ val ┆ udfs │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞══════╪══════╡ - │ a: 1 ┆ 5 │ - │ b: 2 ┆ -10 │ - │ c: 3 ┆ 15 │ - │ d: 4 ┆ -20 │ - └──────┴──────┘ - - ''' - def is_not(self) -> Self: - """ - Negate a boolean expression. - - .. deprecated:: 0.19.2 - This method has been renamed to :func:`Expr.not_`. - - """ - def not_(self) -> Self: - ''' - Negate a boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, False, False], - ... "b": ["a", "b", None], - ... } - ... ) - >>> df - shape: (3, 2) - ┌───────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ bool ┆ str │ - ╞═══════╪══════╡ - │ true ┆ a │ - │ false ┆ b │ - │ false ┆ null │ - └───────┴──────┘ - >>> df.select(pl.col("a").not_()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ true │ - └───────┘ - - ''' - def is_null(self) -> Self: - ''' - Returns a boolean Series indicating which values are null. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null - shape: (5, 4) - ┌──────┬─────┬──────────┬──────────┐ - │ a ┆ b ┆ a_isnull ┆ b_isnull │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪═════╪══════════╪══════════╡ - │ 1 ┆ 1.0 ┆ false ┆ false │ - │ 2 ┆ 2.0 ┆ false ┆ false │ - │ null ┆ NaN ┆ true ┆ false │ - │ 1 ┆ 1.0 ┆ false ┆ false │ - │ 5 ┆ 5.0 ┆ false ┆ false │ - └──────┴─────┴──────────┴──────────┘ - - ''' - def is_not_null(self) -> Self: - ''' - Returns a boolean Series indicating which values are not null. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns( - ... pl.all().is_not_null().name.suffix("_not_null") # nan != null - ... 
) - shape: (5, 4) - ┌──────┬─────┬────────────┬────────────┐ - │ a ┆ b ┆ a_not_null ┆ b_not_null │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪═════╪════════════╪════════════╡ - │ 1 ┆ 1.0 ┆ true ┆ true │ - │ 2 ┆ 2.0 ┆ true ┆ true │ - │ null ┆ NaN ┆ false ┆ true │ - │ 1 ┆ 1.0 ┆ true ┆ true │ - │ 5 ┆ 5.0 ┆ true ┆ true │ - └──────┴─────┴────────────┴────────────┘ - - ''' - def is_finite(self) -> Self: - ''' - Returns a boolean Series indicating which values are finite. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1.0, 2], - ... "B": [3.0, float("inf")], - ... } - ... ) - >>> df.select(pl.all().is_finite()) - shape: (2, 2) - ┌──────┬───────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞══════╪═══════╡ - │ true ┆ true │ - │ true ┆ false │ - └──────┴───────┘ - - ''' - def is_infinite(self) -> Self: - ''' - Returns a boolean Series indicating which values are infinite. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1.0, 2], - ... "B": [3.0, float("inf")], - ... } - ... ) - >>> df.select(pl.all().is_infinite()) - shape: (2, 2) - ┌───────┬───────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞═══════╪═══════╡ - │ false ┆ false │ - │ false ┆ true │ - └───────┴───────┘ - - ''' - def is_nan(self) -> Self: - ''' - Returns a boolean Series indicating which values are NaN. - - Notes - ----- - Floating point `NaN` (Not A Number) should not be confused - with missing data represented as `Null/None`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) - shape: (5, 3) - ┌──────┬─────┬─────────┐ - │ a ┆ b ┆ b_isnan │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪═════╪═════════╡ - │ 1 ┆ 1.0 ┆ false │ - │ 2 ┆ 2.0 ┆ false │ - │ null ┆ NaN ┆ true │ - │ 1 ┆ 1.0 ┆ false │ - │ 5 ┆ 5.0 ┆ false │ - └──────┴─────┴─────────┘ - - ''' - def is_not_nan(self) -> Self: - ''' - Returns a boolean Series indicating which values are not NaN. - - Notes - ----- - Floating point `NaN` (Not A Number) should not be confused - with missing data represented as `Null/None`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) - shape: (5, 3) - ┌──────┬─────┬──────────────┐ - │ a ┆ b ┆ b_is_not_nan │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪═════╪══════════════╡ - │ 1 ┆ 1.0 ┆ true │ - │ 2 ┆ 2.0 ┆ true │ - │ null ┆ NaN ┆ false │ - │ 1 ┆ 1.0 ┆ true │ - │ 5 ┆ 5.0 ┆ true │ - └──────┴─────┴──────────────┘ - - ''' - def agg_groups(self) -> Self: - ''' - Get the group indexes of the group by operation. - - Should be used in aggregation context only. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... "one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [94, 95, 96, 97, 97, 99], - ... } - ... 
) - >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[u32] │ - ╞═══════╪═══════════╡ - │ one ┆ [0, 1, 2] │ - │ two ┆ [3, 4, 5] │ - └───────┴───────────┘ - - ''' - def count(self) -> Self: - ''' - Return the number of elements in the column. - - .. warning:: - Null values are treated like regular elements in this context. - - Examples - -------- - >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) - >>> df.select(pl.all().count()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 3 ┆ 3 │ - └─────┴─────┘ - - ''' - def len(self) -> Self: - ''' - Return the number of elements in the column. - - Null values are treated like regular elements in this context. - - Alias for :func:`count`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) - >>> df.select(pl.all().len()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 3 ┆ 3 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: - ''' - Get a slice of this expression. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10, 11], - ... "b": [None, 4, 4, 4], - ... } - ... ) - >>> df.select(pl.all().slice(1, 2)) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 9 ┆ 4 │ - │ 10 ┆ 4 │ - └─────┴─────┘ - - ''' - def append(self, other: IntoExpr) -> Self: - ''' - Append expressions. - - This is done by adding the chunks of `other` to this `Series`. - - Parameters - ---------- - other - Expression to append. - upcast - Cast both `Series` to the same supertype. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10], - ... "b": [None, 4, 4], - ... } - ... ) - >>> df.select(pl.all().head(1).append(pl.all().tail(1))) - shape: (2, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 8 ┆ null │ - │ 10 ┆ 4 │ - └─────┴──────┘ - - ''' - def rechunk(self) -> Self: - ''' - Create a single chunk of memory for this Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - - Create a Series with 3 nulls, append column a then rechunk - - >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) - shape: (6, 1) - ┌────────┐ - │ repeat │ - │ --- │ - │ i64 │ - ╞════════╡ - │ null │ - │ null │ - │ null │ - │ 1 │ - │ 1 │ - │ 2 │ - └────────┘ - - ''' - def drop_nulls(self) -> Self: - ''' - Drop all null values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nans - - Notes - ----- - A null value is not the same as a NaN value. - To drop NaN values, use :func:`drop_nans`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) - >>> df.select(pl.col("a").drop_nulls()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 3.0 │ - │ NaN │ - └─────┘ - - ''' - def drop_nans(self) -> Self: - ''' - Drop all floating point NaN values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nulls - - Notes - ----- - A NaN value is not the same as a null value. - To drop null values, use :func:`drop_nulls`. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) - >>> df.select(pl.col("a").drop_nans()) - shape: (3, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 1.0 │ - │ null │ - │ 3.0 │ - └──────┘ - - ''' - def cum_sum(self) -> Self: - ''' - Get an array with the cumulative sum computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_sum().alias("cum_sum"), - ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_sum ┆ cum_sum_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 10 │ - │ 2 ┆ 3 ┆ 9 │ - │ 3 ┆ 6 ┆ 7 │ - │ 4 ┆ 10 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - Null values are excluded, but can also be filled by calling `forward_fill`. - - >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) - >>> df.with_columns( - ... pl.col("values").cum_sum().alias("value_cum_sum"), - ... pl.col("values") - ... .cum_sum() - ... .forward_fill() - ... .alias("value_cum_sum_all_filled"), - ... ) - shape: (8, 3) - ┌────────┬───────────────┬──────────────────────────┐ - │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞════════╪═══════════════╪══════════════════════════╡ - │ null ┆ null ┆ null │ - │ 10 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 8 ┆ 18 ┆ 18 │ - │ 9 ┆ 27 ┆ 27 │ - │ null ┆ null ┆ 27 │ - │ 16 ┆ 43 ┆ 43 │ - │ null ┆ null ┆ 43 │ - └────────┴───────────────┴──────────────────────────┘ - - ''' - def cum_prod(self) -> Self: - ''' - Get an array with the cumulative product computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_prod().alias("cum_prod"), - ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), - ... ) - shape: (4, 3) - ┌─────┬──────────┬──────────────────┐ - │ a ┆ cum_prod ┆ cum_prod_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪══════════╪══════════════════╡ - │ 1 ┆ 1 ┆ 24 │ - │ 2 ┆ 2 ┆ 24 │ - │ 3 ┆ 6 ┆ 12 │ - │ 4 ┆ 24 ┆ 4 │ - └─────┴──────────┴──────────────────┘ - - ''' - def cum_min(self) -> Self: - ''' - Get an array with the cumulative min computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_min().alias("cum_min"), - ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_min ┆ cum_min_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 1 ┆ 2 │ - │ 3 ┆ 1 ┆ 3 │ - │ 4 ┆ 1 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - ''' - def cum_max(self) -> Self: - ''' - Get an array with the cumulative max computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... 
pl.col("a").cum_max().alias("cum_max"), - ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_max ┆ cum_max_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 4 │ - │ 2 ┆ 2 ┆ 4 │ - │ 3 ┆ 3 ┆ 4 │ - │ 4 ┆ 4 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - Null values are excluded, but can also be filled by calling `forward_fill`. - - >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) - >>> df.with_columns( - ... pl.col("values").cum_max().alias("cum_max"), - ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), - ... ) - shape: (8, 3) - ┌────────┬─────────┬────────────────────┐ - │ values ┆ cum_max ┆ cum_max_all_filled │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞════════╪═════════╪════════════════════╡ - │ null ┆ null ┆ null │ - │ 10 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 8 ┆ 10 ┆ 10 │ - │ 9 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 16 ┆ 16 ┆ 16 │ - │ null ┆ null ┆ 16 │ - └────────┴─────────┴────────────────────┘ - - ''' - def cum_count(self) -> Self: - ''' - Get an array with the cumulative count computed at every element. - - Counting from 0 to len - - Parameters - ---------- - reverse - Reverse the operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_count().alias("cum_count"), - ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), - ... ) - shape: (4, 3) - ┌─────┬───────────┬───────────────────┐ - │ a ┆ cum_count ┆ cum_count_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ u32 ┆ u32 │ - ╞═════╪═══════════╪═══════════════════╡ - │ 1 ┆ 0 ┆ 3 │ - │ 2 ┆ 1 ┆ 2 │ - │ 3 ┆ 2 ┆ 1 │ - │ 4 ┆ 3 ┆ 0 │ - └─────┴───────────┴───────────────────┘ - - ''' - def floor(self) -> Self: - ''' - Rounds down to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) - >>> df.select(pl.col("a").floor()) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - │ 0.0 │ - │ 1.0 │ - │ 1.0 │ - └─────┘ - - ''' - def ceil(self) -> Self: - ''' - Rounds up to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) - >>> df.select(pl.col("a").ceil()) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 1.0 │ - │ 1.0 │ - │ 2.0 │ - └─────┘ - - ''' - def round(self, decimals: int = ...) -> Self: - ''' - Round underlying floating point data by `decimals` digits. - - Parameters - ---------- - decimals - Number of decimals to round by. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) - >>> df.select(pl.col("a").round(1)) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.3 │ - │ 0.5 │ - │ 1.0 │ - │ 1.2 │ - └─────┘ - - ''' - def round_sig_figs(self, digits: int) -> Self: - ''' - Round to a number of significant figures. - - Parameters - ---------- - digits - Number of significant figures to round to. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) - >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) - shape: (3, 2) - ┌─────────┬────────────────┐ - │ a ┆ round_sig_figs │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════════╪════════════════╡ - │ 0.01234 ┆ 0.012 │ - │ 3.333 ┆ 3.3 │ - │ 1234.0 ┆ 1200.0 │ - └─────────┴────────────────┘ - - ''' - def dot(self, other: Expr | str) -> Self: - ''' - Compute the dot/inner product between two Expressions. - - Parameters - ---------- - other - Expression to compute dot product with. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> df.select(pl.col("a").dot(pl.col("b"))) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 44 │ - └─────┘ - - ''' - def mode(self) -> Self: - ''' - Compute the most occurring value(s). - - Can return multiple Values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 1, 2, 3], - ... "b": [1, 1, 2, 2], - ... } - ... ) - >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 1 │ - │ 1 ┆ 2 │ - └─────┴─────┘ - - ''' - def cast(self, dtype: PolarsDataType | type[Any]) -> Self: - ''' - Cast between data types. - - Parameters - ---------- - dtype - DataType to cast to. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["4", "5", "6"], - ... } - ... ) - >>> df.with_columns( - ... [ - ... pl.col("a").cast(pl.Float64), - ... pl.col("b").cast(pl.Int32), - ... ] - ... ) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ i32 │ - ╞═════╪═════╡ - │ 1.0 ┆ 4 │ - │ 2.0 ┆ 5 │ - │ 3.0 ┆ 6 │ - └─────┴─────┘ - - ''' - def sort(self) -> Self: - ''' - Sort this column. - - When used in a projection/selection context, the whole column is sorted. - When used in a group by context, the groups are sorted. - - Parameters - ---------- - descending - Sort in descending order. - nulls_last - Place null values last. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, None, 3, 2], - ... } - ... ) - >>> df.select(pl.col("a").sort()) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ null │ - │ 1 │ - │ 2 │ - │ 3 │ - └──────┘ - >>> df.select(pl.col("a").sort(descending=True)) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ null │ - │ 3 │ - │ 2 │ - │ 1 │ - └──────┘ - >>> df.select(pl.col("a").sort(nulls_last=True)) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ 1 │ - │ 2 │ - │ 3 │ - │ null │ - └──────┘ - - When sorting in a group by context, the groups are sorted. - - >>> df = pl.DataFrame( - ... { - ... "group": ["one", "one", "one", "two", "two", "two"], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌───────┬────────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪════════════╡ - │ two ┆ [3, 4, 99] │ - │ one ┆ [1, 2, 98] │ - └───────┴────────────┘ - - ''' - def top_k(self, k: int | IntoExprColumn = ...) -> Self: - ''' - Return the `k` largest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. 
- - See Also - -------- - bottom_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("value").top_k().alias("top_k"), - ... pl.col("value").bottom_k().alias("bottom_k"), - ... ] - ... ) - shape: (5, 2) - ┌───────┬──────────┐ - │ top_k ┆ bottom_k │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═══════╪══════════╡ - │ 99 ┆ 1 │ - │ 98 ┆ 2 │ - │ 4 ┆ 3 │ - │ 3 ┆ 4 │ - │ 2 ┆ 98 │ - └───────┴──────────┘ - - ''' - def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: - ''' - Return the `k` smallest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - top_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("value").top_k().alias("top_k"), - ... pl.col("value").bottom_k().alias("bottom_k"), - ... ] - ... ) - shape: (5, 2) - ┌───────┬──────────┐ - │ top_k ┆ bottom_k │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═══════╪══════════╡ - │ 99 ┆ 1 │ - │ 98 ┆ 2 │ - │ 4 ┆ 3 │ - │ 3 ┆ 4 │ - │ 2 ┆ 98 │ - └───────┴──────────┘ - - ''' - def arg_sort(self) -> Self: - ''' - Get the index values that would sort this column. - - Parameters - ---------- - descending - Sort in descending (descending) order. - nulls_last - Place null values last instead of first. - - Returns - ------- - Expr - Expression of data type :class:`UInt32`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... ) - >>> df.select(pl.col("a").arg_sort()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - │ 0 │ - │ 2 │ - └─────┘ - - ''' - def arg_max(self) -> Self: - ''' - Get the index of the maximal value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... ) - >>> df.select(pl.col("a").arg_max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def arg_min(self) -> Self: - ''' - Get the index of the minimal value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... ) - >>> df.select(pl.col("a").arg_min()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - └─────┘ - - ''' - def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: - ''' - Find indices where elements should be inserted to maintain order. - - .. math:: a[i-1] < v <= a[i] - - Parameters - ---------- - element - Expression or scalar value. - side : {\'any\', \'left\', \'right\'} - If \'any\', the index of the first suitable location found is given. - If \'left\', the index of the leftmost suitable location found is given. - If \'right\', return the rightmost suitable location found is given. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "values": [1, 2, 3, 5], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("values").search_sorted(0).alias("zero"), - ... pl.col("values").search_sorted(3).alias("three"), - ... pl.col("values").search_sorted(6).alias("six"), - ... ] - ... ) - shape: (1, 3) - ┌──────┬───────┬─────┐ - │ zero ┆ three ┆ six │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞══════╪═══════╪═════╡ - │ 0 ┆ 2 ┆ 4 │ - └──────┴───────┴─────┘ - - ''' - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: - ''' - Sort this column by the ordering of other columns. 
- - When used in a projection/selection context, the whole column is sorted. - When used in a group by context, the groups are sorted. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. - *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "a", "b", "b"], - ... "value1": [1, 3, 4, 2], - ... "value2": [8, 7, 6, 5], - ... } - ... ) - >>> df.select(pl.col("group").sort_by("value1")) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ a │ - │ b │ - │ a │ - │ b │ - └───────┘ - - Sorting by expressions is also supported. - - >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ b │ - │ a │ - │ a │ - │ b │ - └───────┘ - - Sort by multiple columns by passing a list of columns. - - >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ b │ - │ a │ - │ b │ - │ a │ - └───────┘ - - Or use positional arguments to sort by multiple columns in the same way. - - >>> df.select(pl.col("group").sort_by("value1", "value2")) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ a │ - │ b │ - │ a │ - │ b │ - └───────┘ - - When sorting in a group by context, the groups are sorted. - - >>> df.group_by("group").agg( - ... pl.col("value1").sort_by("value2") - ... ) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value1 │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ a ┆ [3, 1] │ - │ b ┆ [2, 4] │ - └───────┴───────────┘ - - Take a single row from each group where a column attains its minimal value - within that group. - - >>> df.group_by("group").agg( - ... pl.all().sort_by("value2").first() - ... ) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌───────┬────────┬────────┐ - │ group ┆ value1 ┆ value2 | - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 | - ╞═══════╪════════╪════════╡ - │ a ┆ 3 ┆ 7 | - │ b ┆ 2 ┆ 5 | - └───────┴────────┴────────┘ - - ''' - def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: - ''' - Take values by index. - - Parameters - ---------- - indices - An expression that leads to a UInt32 dtyped Series. - - Returns - ------- - Expr - Expression of the same data type. - - See Also - -------- - Expr.get : Take a single value - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... "one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group", maintain_order=True).agg( - ... pl.col("value").gather([2, 1]) - ... ) - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ one ┆ [2, 98] │ - │ two ┆ [4, 99] │ - └───────┴───────────┘ - ''' - def get(self, index: int | Expr) -> Self: - ''' - Return a single value by index. - - Parameters - ---------- - index - An expression that leads to a UInt32 index. - - Returns - ------- - Expr - Expression of the same data type. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... 
"one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) - shape: (2, 2) - ┌───────┬───────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═══════╡ - │ one ┆ 98 │ - │ two ┆ 99 │ - └───────┴───────┘ - - ''' - def shift(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns(shift=pl.col("a").shift()) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ null │ - │ 2 ┆ 1 │ - │ 3 ┆ 2 │ - │ 4 ┆ 3 │ - └─────┴───────┘ - - Pass a negative value to shift in the opposite direction instead. - - >>> df.with_columns(shift=pl.col("a").shift(-2)) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - │ 3 ┆ null │ - │ 4 ┆ null │ - └─────┴───────┘ - - Specify `fill_value` to fill the resulting null values. - - >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - │ 3 ┆ 100 │ - │ 4 ┆ 100 │ - └─────┴───────┘ - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: - ''' - Fill null values using the specified value or strategy. - - To interpolate over null values see interpolate. - See the examples below to fill nulls with an expression. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... } - ... 
) - >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 0 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(99)) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 99 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 4 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞══════╪═════╡ - │ 1 ┆ 4.0 │ - │ 2 ┆ 5.0 │ - │ null ┆ 6.0 │ - └──────┴─────┘ - >>> df.with_columns(pl.all().fill_null(pl.all().median())) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 1.0 ┆ 4.0 │ - │ 2.0 ┆ 5.0 │ - │ 1.5 ┆ 6.0 │ - └─────┴─────┘ - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Self: - ''' - Fill floating point NaN value with a fill value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, None, float("nan")], - ... "b": [4.0, float("nan"), 6], - ... } - ... ) - >>> df.with_columns(pl.col("b").fill_nan(0)) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪═════╡ - │ 1.0 ┆ 4.0 │ - │ null ┆ 0.0 │ - │ NaN ┆ 6.0 │ - └──────┴─────┘ - - ''' - def forward_fill(self, limit: int | None = ...) -> Self: - ''' - Fill missing values with the latest seen values. - - Parameters - ---------- - limit - The number of consecutive null values to forward fill. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... } - ... ) - >>> df.select(pl.all().forward_fill()) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 4 │ - │ 2 ┆ 6 │ - └─────┴─────┘ - - ''' - def backward_fill(self, limit: int | None = ...) -> Self: - ''' - Fill missing values with the next to be seen values. - - Parameters - ---------- - limit - The number of consecutive null values to backward fill. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... "c": [None, None, 2], - ... } - ... ) - >>> df.select(pl.all().backward_fill()) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 4 ┆ 2 │ - │ 2 ┆ 6 ┆ 2 │ - │ null ┆ 6 ┆ 2 │ - └──────┴─────┴─────┘ - >>> df.select(pl.all().backward_fill(limit=1)) - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ 1 ┆ 4 ┆ null │ - │ 2 ┆ 6 ┆ 2 │ - │ null ┆ 6 ┆ 2 │ - └──────┴─────┴──────┘ - - ''' - def reverse(self) -> Self: - ''' - Reverse the selection. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4, 5], - ... "fruits": ["banana", "banana", "apple", "apple", "banana"], - ... "B": [5, 4, 3, 2, 1], - ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], - ... } - ... ) - >>> df.select( - ... [ - ... pl.all(), - ... pl.all().reverse().name.suffix("_reverse"), - ... ] - ... 
) - shape: (5, 8) - ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ - │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ - │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ - │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ - │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ - │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ - │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ - └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ - - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Get standard deviation. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").std()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Get variance. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").var()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def max(self) -> Self: - ''' - Get maximum value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) - >>> df.select(pl.col("a").max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def min(self) -> Self: - ''' - Get minimum value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) - >>> df.select(pl.col("a").min()) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ -1.0 │ - └──────┘ - - ''' - def nan_max(self) -> Self: - ''' - Get maximum value, but propagate/poison encountered NaN values. - - This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0, float("nan")]}) - >>> df.select(pl.col("a").nan_max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ NaN │ - └─────┘ - - ''' - def nan_min(self) -> Self: - ''' - Get minimum value, but propagate/poison encountered NaN values. - - This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0, float("nan")]}) - >>> df.select(pl.col("a").nan_min()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ NaN │ - └─────┘ - - ''' - def sum(self) -> Self: - ''' - Get sum value. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").sum()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 0 │ - └─────┘ - - ''' - def mean(self) -> Self: - ''' - Get mean value. 
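Null values are skipped when computing the mean, so for a column like [1, None, 5] the result is 3.0. A minimal illustration of that behaviour (an illustrative sketch; output as in recent Polars versions):

>>> pl.DataFrame({"a": [1, None, 5]}).select(pl.col("a").mean())
shape: (1, 1)
┌─────┐
│ a   │
│ --- │
│ f64 │
╞═════╡
│ 3.0 │
└─────┘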
- - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").mean()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def median(self) -> Self: - ''' - Get median value using linear interpolation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").median()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def product(self) -> Self: - ''' - Compute the product of an expression. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").product()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def n_unique(self) -> Self: - ''' - Count unique values. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").n_unique()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def approx_n_unique(self) -> Self: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").approx_n_unique()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def null_count(self) -> Self: - ''' - Count null values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 1, None], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.select(pl.all().null_count()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 2 ┆ 0 │ - └─────┴─────┘ - - ''' - def arg_unique(self) -> Self: - ''' - Get index of first unique value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10], - ... "b": [None, 4, 4], - ... } - ... ) - >>> df.select(pl.col("a").arg_unique()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - │ 2 │ - └─────┘ - >>> df.select(pl.col("b").arg_unique()) - shape: (2, 1) - ┌─────┐ - │ b │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - └─────┘ - - ''' - def unique(self) -> Self: - ''' - Get unique values of this expression. - - Parameters - ---------- - maintain_order - Maintain order of data. This requires more work. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - │ 1 │ - └─────┘ - >>> df.select(pl.col("a").unique(maintain_order=True)) - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - └─────┘ - - ''' - def first(self) -> Self: - ''' - Get the first value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").first()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - └─────┘ - - ''' - def last(self) -> Self: - ''' - Get the last value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").last()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: - ''' - Compute expressions over the given groups. - - This expression is similar to performing a group by aggregation and joining the - result back into the original DataFrame. 
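As a rough sketch of that equivalence (illustrative only, correct up to row order, and not a literal description of how the engine evaluates it; `df` as in the examples below):

>>> df.with_columns(pl.col("c").max().over("a").alias("c_max"))  # doctest: +SKIP
>>> # behaves much like aggregating per group and joining the result back:
>>> df.join(
...     df.group_by("a").agg(pl.col("c").max().alias("c_max")), on="a"
... )  # doctest: +SKIP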
- - The outcome is similar to how `window functions - `_ - work in PostgreSQL. - - Parameters - ---------- - expr - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_exprs - Additional columns to group by, specified as positional arguments. - mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} - - group_to_rows - If the aggregation results in multiple values, assign them back to their - position in the DataFrame. This can only be done if the group yields - the same elements before aggregation as after. - - join - Join the groups as \'List\' to the row positions. - warning: this can be memory intensive. - - explode - Don\'t do any mapping, but simply flatten the group. - This only makes sense if the input data is sorted. - - Examples - -------- - Pass the name of a column to compute the expression over that column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "a", "b", "b", "b"], - ... "b": [1, 2, 3, 5, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... ) - >>> df.with_columns( - ... pl.col("c").max().over("a").name.suffix("_max"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_max │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 5 │ - │ b ┆ 3 ┆ 3 ┆ 3 │ - │ b ┆ 5 ┆ 2 ┆ 3 │ - │ b ┆ 3 ┆ 1 ┆ 3 │ - └─────┴─────┴─────┴───────┘ - - Expression input is supported. - - >>> df.with_columns( - ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_max │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 4 │ - │ b ┆ 5 ┆ 2 ┆ 2 │ - │ b ┆ 3 ┆ 1 ┆ 4 │ - └─────┴─────┴─────┴───────┘ - - Group by multiple columns by passing a list of column names or expressions. - - >>> df.with_columns( - ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_min │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 1 │ - │ b ┆ 5 ┆ 2 ┆ 2 │ - │ b ┆ 3 ┆ 1 ┆ 1 │ - └─────┴─────┴─────┴───────┘ - - Or use positional arguments to group by multiple columns in the same way. - - >>> df.with_columns( - ... pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_min │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 1 │ - │ b ┆ 5 ┆ 2 ┆ 1 │ - │ b ┆ 3 ┆ 1 ┆ 1 │ - └─────┴─────┴─────┴───────┘ - - ''' - def rolling(self, index_column: str) -> Self: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - If you have a time series ``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... - * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... 
- * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order. - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> df.with_columns( - ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), - ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), - ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), - ... ) - shape: (6, 5) - ┌─────────────────────┬─────┬───────┬───────┬───────┐ - │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴─────┴───────┴───────┴───────┘ - - ''' - def is_unique(self) -> Self: - ''' - Get mask of unique values. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").is_unique()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ false │ - │ true │ - └───────┘ - - ''' - def is_first_distinct(self) -> Self: - ''' - Return a boolean mask indicating the first occurrence of each distinct value. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. 
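A common use is keeping only the first occurrence of each value by filtering with the mask (an illustrative sketch; `df` as in the example below):

>>> df.filter(pl.col("a").is_first_distinct())  # doctest: +SKIP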
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) - >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) - shape: (5, 2) - ┌─────┬───────┐ - │ a ┆ first │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪═══════╡ - │ 1 ┆ true │ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 2 ┆ false │ - └─────┴───────┘ - - ''' - def is_last_distinct(self) -> Self: - ''' - Return a boolean mask indicating the last occurrence of each distinct value. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) - >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) - shape: (5, 2) - ┌─────┬───────┐ - │ a ┆ last │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪═══════╡ - │ 1 ┆ false │ - │ 1 ┆ true │ - │ 2 ┆ false │ - │ 3 ┆ true │ - │ 2 ┆ true │ - └─────┴───────┘ - - ''' - def is_duplicated(self) -> Self: - ''' - Return a boolean mask indicating duplicated values. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").is_duplicated()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ true │ - │ true │ - │ false │ - └───────┘ - - ''' - def peak_max(self) -> Self: - ''' - Get a boolean mask of the local maximum peaks. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) - >>> df.select(pl.col("a").peak_max()) - shape: (5, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ false │ - │ false │ - │ false │ - │ true │ - └───────┘ - - ''' - def peak_min(self) -> Self: - ''' - Get a boolean mask of the local minimum peaks. - - Examples - -------- - >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) - >>> df.select(pl.col("a").peak_min()) - shape: (5, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ false │ - │ true │ - │ false │ - └───────┘ - - ''' - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Get quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) - >>> df.select(pl.col("a").quantile(0.3)) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 2.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.5 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.5 │ - └─────┘ - - ''' - def cut(self, breaks: Sequence[float]) -> Self: - ''' - Bin continuous values into discrete categories. - - Parameters - ---------- - breaks - List of unique cut points. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - left_closed - Set the intervals to be left-closed instead of right-closed. 
- include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - - Returns - ------- - Expr - Expression of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise an expression of data type :class:`Struct`. - - See Also - -------- - qcut - - Examples - -------- - Divide a column into three categories. - - >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) - >>> df.with_columns( - ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") - ... ) - shape: (5, 2) - ┌─────┬─────┐ - │ foo ┆ cut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪═════╡ - │ -2 ┆ a │ - │ -1 ┆ a │ - │ 0 ┆ b │ - │ 1 ┆ b │ - │ 2 ┆ c │ - └─────┴─────┘ - - Add both the category and the breakpoint. - - >>> df.with_columns( - ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") - ... ).unnest("cut") - shape: (5, 3) - ┌─────┬──────┬────────────┐ - │ foo ┆ brk ┆ foo_bin │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪══════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴──────┴────────────┘ - - ''' - def qcut(self, quantiles: Sequence[float] | int) -> Self: - ''' - Bin continuous values into discrete categories based on their quantiles. - - Parameters - ---------- - quantiles - Either a list of quantile probabilities between 0 and 1 or a positive - integer determining the number of bins with uniform probability. - labels - Names of the categories. The number of labels must be equal to the number - of categories. - left_closed - Set the intervals to be left-closed instead of right-closed. - allow_duplicates - If set to `True`, duplicates in the resulting quantiles are dropped, - rather than raising a `DuplicateError`. This can happen even with unique - probabilities, depending on the data. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - - Returns - ------- - Expr - Expression of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise an expression of data type :class:`Struct`. - - See Also - -------- - cut - - Examples - -------- - Divide a column into three categories according to pre-defined quantile - probabilities. - - >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) - >>> df.with_columns( - ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ foo ┆ qcut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪══════╡ - │ -2 ┆ a │ - │ -1 ┆ a │ - │ 0 ┆ b │ - │ 1 ┆ b │ - │ 2 ┆ c │ - └─────┴──────┘ - - Divide a column into two categories using uniform quantile probabilities. - - >>> df.with_columns( - ... pl.col("foo") - ... .qcut(2, labels=["low", "high"], left_closed=True) - ... .alias("qcut") - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ foo ┆ qcut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪══════╡ - │ -2 ┆ low │ - │ -1 ┆ low │ - │ 0 ┆ high │ - │ 1 ┆ high │ - │ 2 ┆ high │ - └─────┴──────┘ - - Add both the category and the breakpoint. - - >>> df.with_columns( - ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") - ... 
).unnest("qcut") - shape: (5, 3) - ┌─────┬──────┬────────────┐ - │ foo ┆ brk ┆ foo_bin │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪══════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴──────┴────────────┘ - - ''' - def rle(self) -> Self: - ''' - Get the lengths of runs of identical values. - - Returns - ------- - Expr - Expression of data type :class:`Struct` with Fields "lengths" and "values". - - Examples - -------- - >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) - >>> df.select(pl.col("s").rle()).unnest("s") - shape: (6, 2) - ┌─────────┬────────┐ - │ lengths ┆ values │ - │ --- ┆ --- │ - │ i32 ┆ i64 │ - ╞═════════╪════════╡ - │ 2 ┆ 1 │ - │ 1 ┆ 2 │ - │ 1 ┆ 1 │ - │ 1 ┆ null │ - │ 1 ┆ 1 │ - │ 2 ┆ 3 │ - └─────────┴────────┘ - ''' - def rle_id(self) -> Self: - ''' - Map values to run IDs. - - Similar to RLE, but it maps each value to an ID corresponding to the run into - which it falls. This is especially useful when you want to define groups by - runs of identical values rather than the values themselves. - - - Examples - -------- - >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) - >>> # It works on structs of multiple values too! - >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) - shape: (5, 4) - ┌─────┬──────┬─────┬──────┐ - │ a ┆ b ┆ a_r ┆ ab_r │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ u32 ┆ u32 │ - ╞═════╪══════╪═════╪══════╡ - │ 1 ┆ x ┆ 0 ┆ 0 │ - │ 2 ┆ x ┆ 1 ┆ 1 │ - │ 1 ┆ null ┆ 2 ┆ 2 │ - │ 1 ┆ y ┆ 2 ┆ 3 │ - │ 1 ┆ y ┆ 2 ┆ 3 │ - └─────┴──────┴─────┴──────┘ - ''' - def filter(self, predicate: Expr) -> Self: - ''' - Filter a single column. - - The original order of the remaining elements is preserved. - - Mostly useful in an aggregation context. If you want to filter on a DataFrame - level, use `LazyFrame.filter`. - - Parameters - ---------- - predicate - Boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group_col": ["g1", "g1", "g2"], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.group_by("group_col").agg( - ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), - ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), - ... ).sort("group_col") - shape: (2, 3) - ┌───────────┬─────┬─────┐ - │ group_col ┆ lt ┆ gte │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═══════════╪═════╪═════╡ - │ g1 ┆ 1 ┆ 2 │ - │ g2 ┆ 0 ┆ 3 │ - └───────────┴─────┴─────┘ - - ''' - def where(self, predicate: Expr) -> Self: - ''' - Filter a single column. - - Alias for :func:`filter`. - - Parameters - ---------- - predicate - Boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group_col": ["g1", "g1", "g2"], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.group_by("group_col").agg( - ... [ - ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), - ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), - ... ] - ... ).sort("group_col") - shape: (2, 3) - ┌───────────┬─────┬─────┐ - │ group_col ┆ lt ┆ gte │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═══════════╪═════╪═════╡ - │ g1 ┆ 1 ┆ 2 │ - │ g2 ┆ 0 ┆ 3 │ - └───────────┴─────┴─────┘ - - ''' - def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Apply a custom python function to a whole Series or sequence of Series. - - The output of this custom function must be a Series. 
If you want to apply a - custom function elementwise over single values, see :func:`map_elements`. - A reasonable use case for `map` functions is transforming the values - represented by an expression using a third-party library. - - Read more in `the book - `_. - - Parameters - ---------- - function - Lambda/function to apply. - return_dtype - Dtype of the output Series. - agg_list - Aggregate list. - - Notes - ----- - If you are looking to map a function over a window function or group_by context, - refer to func:`map_elements` instead. - - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - See Also - -------- - map_elements - replace - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "sine": [0.0, 1.0, 0.0, -1.0], - ... "cosine": [1.0, 0.0, -1.0, 0.0], - ... } - ... ) - >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) - shape: (1, 2) - ┌──────┬────────┐ - │ sine ┆ cosine │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪════════╡ - │ 1 ┆ 0 │ - └──────┴────────┘ - - ''' - def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Map a custom/user-defined function (UDF) to each element of a column. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - The UDF is applied to each element of a column. Note that, in a GroupBy - context, the column will have been pre-aggregated and so each element - will itself be a Series. Therefore, depending on the context, - requirements for `function` differ: - - * Selection - Expects `function` to be of type `Callable[[Any], Any]`. - Applies a Python function to each individual value in the column. - * GroupBy - Expects `function` to be of type `Callable[[Series], Any]`. - For each group, applies a Python function to the slice of the column - corresponding to that group. - - Parameters - ---------- - function - Lambda/function to map. - return_dtype - Dtype of the output Series. - If not set, the dtype will be `pl.Unknown`. - skip_nulls - Don\'t map the function over values that contain nulls (this is faster). - pass_name - Pass the Series name to the custom function (this is more expensive). - strategy : {\'thread_local\', \'threading\'} - This functionality is considered experimental and may be removed/changed. - - - \'thread_local\': run the python function on a single thread. - - \'threading\': run the python function on separate threads. Use with - care as this can slow performance. This might only speed up - your code if the amount of work per element is significant - and the python function releases the GIL (e.g. via calling - a c function) - - Notes - ----- - * Using `map_elements` is strongly discouraged as you will be effectively - running python "for" loops, which will be very slow. Wherever possible you - should prefer the native expression API to achieve the best performance. - - * If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. - - * Window function application using `over` is considered a GroupBy context - here, so `map_elements` can be used to map functions over window groups. 
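A minimal sketch of the `@lru_cache` suggestion above (the `expensive` function is a hypothetical stand-in for a costly computation):

>>> from functools import lru_cache
>>> @lru_cache(maxsize=None)
... def expensive(x):
...     return x * 2  # stand-in for expensive work
>>> pl.col("a").map_elements(expensive)  # doctest: +SKIP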
- - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["a", "b", "c", "c"], - ... } - ... ) - - The function is applied to each element of column `\'a\'`: - - >>> df.with_columns( # doctest: +SKIP - ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────────┐ - │ a ┆ b ┆ a_times_2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 │ - ╞═════╪═════╪═══════════╡ - │ 1 ┆ a ┆ 2 │ - │ 2 ┆ b ┆ 4 │ - │ 3 ┆ c ┆ 6 │ - │ 1 ┆ c ┆ 2 │ - └─────┴─────┴───────────┘ - - Tip: it is better to implement this with an expression: - - >>> df.with_columns( - ... (pl.col("a") * 2).alias("a_times_2"), - ... ) # doctest: +IGNORE_RESULT - - In a GroupBy context, each element of the column is itself a Series: - - >>> ( - ... df.lazy().group_by("b").agg(pl.col("a")).collect() - ... ) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬───────────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [1] │ - │ b ┆ [2] │ - │ c ┆ [3, 1] │ - └─────┴───────────┘ - - Therefore, from the user\'s point-of-view, the function is applied per-group: - - >>> ( - ... df.lazy() - ... .group_by("b") - ... .agg(pl.col("a").map_elements(lambda x: x.sum())) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ b ┆ 2 │ - │ c ┆ 4 │ - └─────┴─────┘ - - Tip: again, it is better to implement this with an expression: - - >>> ( - ... df.lazy() - ... .group_by("b", maintain_order=True) - ... .agg(pl.col("a").sum()) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - - Window function application using `over` will behave as a GroupBy - context, with your function receiving individual window groups: - - >>> df = pl.DataFrame( - ... { - ... "key": ["x", "x", "y", "x", "y", "z"], - ... "val": [1, 1, 1, 1, 1, 1], - ... } - ... ) - >>> df.with_columns( - ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), - ... ).sort("key") - shape: (6, 3) - ┌─────┬─────┬────────┐ - │ key ┆ val ┆ scaled │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪════════╡ - │ x ┆ 1 ┆ 3 │ - │ x ┆ 1 ┆ 3 │ - │ x ┆ 1 ┆ 3 │ - │ y ┆ 1 ┆ 2 │ - │ y ┆ 1 ┆ 2 │ - │ z ┆ 1 ┆ 1 │ - └─────┴─────┴────────┘ - - Note that this function would *also* be better-implemented natively: - - >>> df.with_columns( - ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), - ... ).sort( - ... "key" - ... ) # doctest: +IGNORE_RESULT - - ''' - def flatten(self) -> Self: - ''' - Flatten a list or string column. - - Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "b", "b"], - ... "values": [[1, 2], [2, 3], [4]], - ... } - ... ) - >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ values │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ a ┆ [1, 2] │ - │ b ┆ [2, 3, 4] │ - └───────┴───────────┘ - - ''' - def explode(self) -> Self: - ''' - Explode a list expression. - - This means that every item is expanded to a new row. - - Returns - ------- - Expr - Expression with the data type of the list elements. - - See Also - -------- - Expr.list.explode : Explode a list column. 
- Expr.str.explode : Explode a string column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "b"], - ... "values": [ - ... [1, 2], - ... [3, 4], - ... ], - ... } - ... ) - >>> df.select(pl.col("values").explode()) - shape: (4, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 1 │ - │ 2 │ - │ 3 │ - │ 4 │ - └────────┘ - - ''' - def implode(self) -> Self: - ''' - Aggregate values into a list. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [4, 5, 6], - ... } - ... ) - >>> df.select(pl.all().implode()) - shape: (1, 2) - ┌───────────┬───────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ list[i64] ┆ list[i64] │ - ╞═══════════╪═══════════╡ - │ [1, 2, 3] ┆ [4, 5, 6] │ - └───────────┴───────────┘ - - ''' - def gather_every(self, n: int) -> Self: - ''' - Take every nth value in the Series and return as a new Series. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) - >>> df.select(pl.col("foo").gather_every(3)) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 4 │ - │ 7 │ - └─────┘ - - ''' - def head(self, n: int | Expr = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.head(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def tail(self, n: int | Expr = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.tail(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 5 │ - │ 6 │ - │ 7 │ - └─────┘ - - ''' - def limit(self, n: int | Expr = ...) -> Self: - ''' - Get the first `n` rows (alias for :func:`Expr.head`). - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.limit(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def and_(self, *others: Any) -> Self: - ''' - Method equivalent of bitwise "and" operator `expr & other & ...`. - - Parameters - ---------- - *others - One or more integer or boolean expressions to evaluate/combine. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5, 6, 7, 4, 8], - ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], - ... "z": [-9, 2, -1, 4, 8], - ... } - ... ) - >>> df.select( - ... (pl.col("x") >= pl.col("z")) - ... .and_( - ... pl.col("y") >= pl.col("z"), - ... pl.col("y") == pl.col("y"), - ... pl.col("z") <= pl.col("x"), - ... pl.col("y") != pl.col("x"), - ... ) - ... .alias("all") - ... ) - shape: (5, 1) - ┌───────┐ - │ all │ - │ --- │ - │ bool │ - ╞═══════╡ - │ true │ - │ true │ - │ true │ - │ false │ - │ false │ - └───────┘ - - ''' - def or_(self, *others: Any) -> Self: - ''' - Method equivalent of bitwise "or" operator `expr | other | ...`. - - Parameters - ---------- - *others - One or more integer or boolean expressions to evaluate/combine. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5, 6, 7, 4, 8], - ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], - ... "z": [-9, 2, -1, 4, 8], - ... } - ... ) - >>> df.select( - ... (pl.col("x") == pl.col("y")) - ... .or_( - ... pl.col("x") == pl.col("y"), - ... 
pl.col("y") == pl.col("z"), - ... pl.col("y").cast(int) == pl.col("z"), - ... ) - ... .alias("any") - ... ) - shape: (5, 1) - ┌───────┐ - │ any │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ false │ - │ true │ - │ false │ - └───────┘ - - ''' - def eq(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr == other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0], - ... "y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").eq(pl.col("y")).alias("x == y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x == y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 1.0 ┆ 2.0 ┆ false │ - │ 2.0 ┆ 2.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 4.0 ┆ 4.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def eq_missing(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr == other` where `None == None`. - - This differs from default `eq` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], - ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").eq(pl.col("y")).alias("x eq y"), - ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), - ... ) - shape: (6, 4) - ┌──────┬──────┬────────┬────────────────┐ - │ x ┆ y ┆ x eq y ┆ x eq_missing y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪══════╪════════╪════════════════╡ - │ 1.0 ┆ 2.0 ┆ false ┆ false │ - │ 2.0 ┆ 2.0 ┆ true ┆ true │ - │ NaN ┆ NaN ┆ false ┆ false │ - │ 4.0 ┆ 4.0 ┆ true ┆ true │ - │ null ┆ 5.0 ┆ null ┆ false │ - │ null ┆ null ┆ null ┆ true │ - └──────┴──────┴────────┴────────────────┘ - - ''' - def ge(self, other: Any) -> Self: - ''' - Method equivalent of "greater than or equal" operator `expr >= other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 2.0], - ... "y": [5.0, 3.0, float("nan"), 1.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ge(pl.col("y")).alias("x >= y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x >= y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 5.0 ┆ 5.0 ┆ true │ - │ 4.0 ┆ 3.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 2.0 ┆ 1.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def gt(self, other: Any) -> Self: - ''' - Method equivalent of "greater than" operator `expr > other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 2.0], - ... "y": [5.0, 3.0, float("nan"), 1.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").gt(pl.col("y")).alias("x > y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────┐ - │ x ┆ y ┆ x > y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪═══════╡ - │ 5.0 ┆ 5.0 ┆ false │ - │ 4.0 ┆ 3.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 2.0 ┆ 1.0 ┆ true │ - └─────┴─────┴───────┘ - - ''' - def le(self, other: Any) -> Self: - ''' - Method equivalent of "less than or equal" operator `expr <= other`. 
- - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 0.5], - ... "y": [5.0, 3.5, float("nan"), 2.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").le(pl.col("y")).alias("x <= y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x <= y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 5.0 ┆ 5.0 ┆ true │ - │ 4.0 ┆ 3.5 ┆ false │ - │ NaN ┆ NaN ┆ false │ - │ 0.5 ┆ 2.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def lt(self, other: Any) -> Self: - ''' - Method equivalent of "less than" operator `expr < other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 3.0], - ... "y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").lt(pl.col("y")).alias("x < y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────┐ - │ x ┆ y ┆ x < y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪═══════╡ - │ 1.0 ┆ 2.0 ┆ true │ - │ 2.0 ┆ 2.0 ┆ false │ - │ NaN ┆ NaN ┆ false │ - │ 3.0 ┆ 4.0 ┆ true │ - └─────┴─────┴───────┘ - - ''' - def ne(self, other: Any) -> Self: - ''' - Method equivalent of inequality operator `expr != other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0], - ... "y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ne(pl.col("y")).alias("x != y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x != y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 1.0 ┆ 2.0 ┆ true │ - │ 2.0 ┆ 2.0 ┆ false │ - │ NaN ┆ NaN ┆ true │ - │ 4.0 ┆ 4.0 ┆ false │ - └─────┴─────┴────────┘ - - ''' - def ne_missing(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr != other` where `None == None`. - - This differs from default `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], - ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ne(pl.col("y")).alias("x ne y"), - ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), - ... ) - shape: (6, 4) - ┌──────┬──────┬────────┬────────────────┐ - │ x ┆ y ┆ x ne y ┆ x ne_missing y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪══════╪════════╪════════════════╡ - │ 1.0 ┆ 2.0 ┆ true ┆ true │ - │ 2.0 ┆ 2.0 ┆ false ┆ false │ - │ NaN ┆ NaN ┆ true ┆ true │ - │ 4.0 ┆ 4.0 ┆ false ┆ false │ - │ null ┆ 5.0 ┆ null ┆ true │ - │ null ┆ null ┆ null ┆ false │ - └──────┴──────┴────────┴────────────────┘ - - ''' - def add(self, other: Any) -> Self: - ''' - Method equivalent of addition operator `expr + other`. - - Parameters - ---------- - other - numeric or string value; accepts expression input. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) - >>> df.with_columns( - ... pl.col("x").add(2).alias("x+int"), - ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), - ... 
) - shape: (5, 3) - ┌─────┬───────┬────────┐ - │ x ┆ x+int ┆ x+expr │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═══════╪════════╡ - │ 1 ┆ 3 ┆ 2 │ - │ 2 ┆ 4 ┆ 4 │ - │ 3 ┆ 5 ┆ 9 │ - │ 4 ┆ 6 ┆ 28 │ - │ 5 ┆ 7 ┆ 125 │ - └─────┴───────┴────────┘ - - >>> df = pl.DataFrame( - ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} - ... ) - >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) - shape: (3, 4) - ┌─────┬─────┬─────┬─────┐ - │ x ┆ y ┆ z ┆ xyz │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ str ┆ str │ - ╞═════╪═════╪═════╪═════╡ - │ a ┆ b ┆ c ┆ abc │ - │ d ┆ e ┆ f ┆ def │ - │ g ┆ h ┆ i ┆ ghi │ - └─────┴─────┴─────┴─────┘ - - ''' - def floordiv(self, other: Any) -> Self: - ''' - Method equivalent of integer division operator `expr // other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - See Also - -------- - truediv - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) - >>> df.with_columns( - ... pl.col("x").truediv(2).alias("x/2"), - ... pl.col("x").floordiv(2).alias("x//2"), - ... ) - shape: (5, 3) - ┌─────┬─────┬──────┐ - │ x ┆ x/2 ┆ x//2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ i64 │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 0.5 ┆ 0 │ - │ 2 ┆ 1.0 ┆ 1 │ - │ 3 ┆ 1.5 ┆ 1 │ - │ 4 ┆ 2.0 ┆ 2 │ - │ 5 ┆ 2.5 ┆ 2 │ - └─────┴─────┴──────┘ - - ''' - def mod(self, other: Any) -> Self: - ''' - Method equivalent of modulus operator `expr % other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) - >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) - shape: (5, 2) - ┌─────┬─────┐ - │ x ┆ x%2 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 0 ┆ 0 │ - │ 1 ┆ 1 │ - │ 2 ┆ 0 │ - │ 3 ┆ 1 │ - │ 4 ┆ 0 │ - └─────┴─────┘ - - ''' - def mul(self, other: Any) -> Self: - ''' - Method equivalent of multiplication operator `expr * other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) - >>> df.with_columns( - ... pl.col("x").mul(2).alias("x*2"), - ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), - ... ) - shape: (5, 3) - ┌─────┬─────┬───────────┐ - │ x ┆ x*2 ┆ x * xlog2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ f64 │ - ╞═════╪═════╪═══════════╡ - │ 1 ┆ 2 ┆ 0.0 │ - │ 2 ┆ 4 ┆ 2.0 │ - │ 4 ┆ 8 ┆ 8.0 │ - │ 8 ┆ 16 ┆ 24.0 │ - │ 16 ┆ 32 ┆ 64.0 │ - └─────┴─────┴───────────┘ - - ''' - def sub(self, other: Any) -> Self: - ''' - Method equivalent of subtraction operator `expr - other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("x").sub(2).alias("x-2"), - ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), - ... ) - shape: (5, 3) - ┌─────┬─────┬────────┐ - │ x ┆ x-2 ┆ x-expr │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪════════╡ - │ 0 ┆ -2 ┆ 0 │ - │ 1 ┆ -1 ┆ 0 │ - │ 2 ┆ 0 ┆ -1 │ - │ 3 ┆ 1 ┆ -3 │ - │ 4 ┆ 2 ┆ -6 │ - └─────┴─────┴────────┘ - - ''' - def truediv(self, other: Any) -> Self: - ''' - Method equivalent of float division operator `expr / other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Notes - ----- - Zero-division behaviour follows IEEE-754: - - 0/0: Invalid operation - mathematically undefined, returns NaN. - n/0: On finite operands gives an exact infinite result, eg: ±infinity. 
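If the infinite/NaN results from division by zero are not wanted, the division can be guarded explicitly (an illustrative sketch using `when`/`then`/`otherwise`; column names are arbitrary):

>>> pl.when(pl.col("y") != 0).then(
...     pl.col("x").truediv(pl.col("y"))
... ).otherwise(None)  # doctest: +SKIP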
- - See Also - -------- - floordiv - - Examples - -------- - >>> df = pl.DataFrame( - ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} - ... ) - >>> df.with_columns( - ... pl.col("x").truediv(2).alias("x/2"), - ... pl.col("x").truediv(pl.col("y")).alias("x/y"), - ... ) - shape: (5, 4) - ┌─────┬──────┬──────┬───────┐ - │ x ┆ y ┆ x/2 ┆ x/y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ f64 ┆ f64 │ - ╞═════╪══════╪══════╪═══════╡ - │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ - │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ - │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ - │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ - │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ - └─────┴──────┴──────┴───────┘ - - ''' - def pow(self, exponent: int | float | None | Series | Expr) -> Self: - ''' - Method equivalent of exponentiation operator `expr ** exponent`. - - Parameters - ---------- - exponent - Numeric literal or expression exponent value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) - >>> df.with_columns( - ... pl.col("x").pow(3).alias("cube"), - ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), - ... ) - shape: (4, 3) - ┌─────┬───────┬────────────┐ - │ x ┆ cube ┆ x ** xlog2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ f64 │ - ╞═════╪═══════╪════════════╡ - │ 1 ┆ 1.0 ┆ 1.0 │ - │ 2 ┆ 8.0 ┆ 2.0 │ - │ 4 ┆ 64.0 ┆ 16.0 │ - │ 8 ┆ 512.0 ┆ 512.0 │ - └─────┴───────┴────────────┘ - - ''' - def xor(self, other: Any) -> Self: - ''' - Method equivalent of bitwise exclusive-or operator `expr ^ other`. - - Parameters - ---------- - other - Integer or boolean value; accepts expression input. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"x": [True, False, True, False], "y": [True, True, False, False]} - ... ) - >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) - shape: (4, 3) - ┌───────┬───────┬───────┐ - │ x ┆ y ┆ x ^ y │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞═══════╪═══════╪═══════╡ - │ true ┆ true ┆ false │ - │ false ┆ true ┆ true │ - │ true ┆ false ┆ true │ - │ false ┆ false ┆ false │ - └───────┴───────┴───────┘ - - >>> def binary_string(n: int) -> str: - ... return bin(n)[2:].zfill(8) - >>> - >>> df = pl.DataFrame( - ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, - ... schema={"x": pl.UInt8, "y": pl.UInt8}, - ... ) - >>> df.with_columns( - ... pl.col("x").map_elements(binary_string).alias("bin_x"), - ... pl.col("y").map_elements(binary_string).alias("bin_y"), - ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), - ... pl.col("x") - ... .xor(pl.col("y")) - ... .map_elements(binary_string) - ... .alias("bin_xor_xy"), - ... ) - shape: (4, 6) - ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ - │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ - ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ - │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ - │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ - │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ - │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ - └─────┴─────┴──────────┴──────────┴────────┴────────────┘ - - ''' - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: - ''' - Check if elements of this expression are present in the other Series. - - Parameters - ---------- - other - Series or sequence of primitive type. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} - ... 
) - >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) - shape: (3, 3) - ┌───────────┬──────────────────┬──────────┐ - │ sets ┆ optional_members ┆ contains │ - │ --- ┆ --- ┆ --- │ - │ list[i64] ┆ i64 ┆ bool │ - ╞═══════════╪══════════════════╪══════════╡ - │ [1, 2, 3] ┆ 1 ┆ true │ - │ [1, 2] ┆ 2 ┆ true │ - │ [9, 10] ┆ 3 ┆ false │ - └───────────┴──────────────────┴──────────┘ - - ''' - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: - ''' - Repeat the elements in this Series as specified in the given expression. - - The repeated elements are expanded into a `List`. - - Parameters - ---------- - by - Numeric column that determines how often the values will be repeated. - The column will be coerced to UInt32. Give this dtype to make the coercion a - no-op. - - Returns - ------- - Expr - Expression of data type :class:`List`, where the inner data type is equal - to the original data type. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["x", "y", "z"], - ... "n": [1, 2, 3], - ... } - ... ) - >>> df.select(pl.col("a").repeat_by("n")) - shape: (3, 1) - ┌─────────────────┐ - │ a │ - │ --- │ - │ list[str] │ - ╞═════════════════╡ - │ ["x"] │ - │ ["y", "y"] │ - │ ["z", "z", "z"] │ - └─────────────────┘ - - ''' - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: - ''' - Check if this expression is between the given start and end values. - - Parameters - ---------- - lower_bound - Lower bound value. Accepts expression input. Strings are parsed as column - names, other non-expression inputs are parsed as literals. - upper_bound - Upper bound value. Accepts expression input. Strings are parsed as column - names, other non-expression inputs are parsed as literals. - closed : {\'both\', \'left\', \'right\', \'none\'} - Define which sides of the interval are closed (inclusive). - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) - >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) - shape: (5, 2) - ┌─────┬────────────┐ - │ num ┆ is_between │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪════════════╡ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 4 ┆ true │ - │ 5 ┆ false │ - └─────┴────────────┘ - - Use the `closed` argument to include or exclude the values at the bounds: - - >>> df.with_columns( - ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") - ... ) - shape: (5, 2) - ┌─────┬────────────┐ - │ num ┆ is_between │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪════════════╡ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 4 ┆ false │ - │ 5 ┆ false │ - └─────┴────────────┘ - - You can also use strings as well as numeric/temporal values (note: ensure that - string literals are wrapped with `lit` so as not to conflate them with - column names): - - >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) - >>> df.with_columns( - ... pl.col("a") - ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") - ... .alias("is_between") - ... ) - shape: (5, 2) - ┌─────┬────────────┐ - │ a ┆ is_between │ - │ --- ┆ --- │ - │ str ┆ bool │ - ╞═════╪════════════╡ - │ a ┆ true │ - │ b ┆ true │ - │ c ┆ true │ - │ d ┆ false │ - │ e ┆ false │ - └─────┴────────────┘ - - ''' - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: - ''' - Hash the elements in the selection. - - The hash value is of type `UInt64`. 
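For example, a single per-row key over several columns can be obtained by hashing a struct of those columns (an illustrative sketch; column names are arbitrary):

>>> pl.struct("a", "b").hash().alias("row_key")  # doctest: +SKIP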
- - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. - - Notes - ----- - This implementation of :func:`rows` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": ["x", None, "z"], - ... } - ... ) - >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌──────────────────────┬──────────────────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u64 ┆ u64 │ - ╞══════════════════════╪══════════════════════╡ - │ 9774092659964970114 ┆ 13614470193936745724 │ - │ 1101441246220388612 ┆ 11638928888656214026 │ - │ 11638928888656214026 ┆ 13382926553367784577 │ - └──────────────────────┴──────────────────────┘ - - ''' - def reinterpret(self) -> Self: - ''' - Reinterpret the underlying bits as a signed/unsigned integer. - - This operation is only allowed for 64bit integers. For lower bits integers, - you can safely use that cast operation. - - Parameters - ---------- - signed - If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. - - Examples - -------- - >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) - >>> df = pl.DataFrame([s]) - >>> df.select( - ... [ - ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), - ... pl.col("a").alias("original"), - ... ] - ... ) - shape: (3, 2) - ┌───────────────┬──────────┐ - │ reinterpreted ┆ original │ - │ --- ┆ --- │ - │ i64 ┆ u64 │ - ╞═══════════════╪══════════╡ - │ 1 ┆ 1 │ - │ 1 ┆ 1 │ - │ 2 ┆ 2 │ - └───────────────┴──────────┘ - - ''' - def inspect(self, fmt: str = ...) -> Self: - ''' - Print the value that this expression evaluates to and pass on the value. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 1, 2]}) - >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) - value is: shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 4 - ] - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 4 │ - └─────┘ - - ''' - def interpolate(self, method: InterpolationMethod = ...) -> Self: - ''' - Fill null values using interpolation. - - Parameters - ---------- - method : {\'linear\', \'nearest\'} - Interpolation method. - - Examples - -------- - Fill null values using linear interpolation. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, None, 3], - ... "b": [1.0, float("nan"), 3.0], - ... } - ... ) - >>> df.select(pl.all().interpolate()) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 1.0 ┆ 1.0 │ - │ 2.0 ┆ NaN │ - │ 3.0 ┆ 3.0 │ - └─────┴─────┘ - - Fill null values using nearest interpolation. - - >>> df.select(pl.all().interpolate("nearest")) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 1.0 │ - │ 3 ┆ NaN │ - │ 3 ┆ 3.0 │ - └─────┴─────┘ - - Regrid data to a new grid. - - >>> df_original_grid = pl.DataFrame( - ... { - ... "grid_points": [1, 3, 10], - ... "values": [2.0, 6.0, 20.0], - ... } - ... ) # Interpolate from this to the new grid - >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) - >>> df_new_grid.join( - ... df_original_grid, on="grid_points", how="left" - ... 
).with_columns(pl.col("values").interpolate()) - shape: (10, 2) - ┌─────────────┬────────┐ - │ grid_points ┆ values │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════════════╪════════╡ - │ 1 ┆ 2.0 │ - │ 2 ┆ 4.0 │ - │ 3 ┆ 6.0 │ - │ 4 ┆ 8.0 │ - │ … ┆ … │ - │ 7 ┆ 14.0 │ - │ 8 ┆ 16.0 │ - │ 9 ┆ 18.0 │ - │ 10 ┆ 20.0 │ - └─────────────┴────────┘ - - ''' - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling min (moving min) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their min. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic - temporal size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min(window_size=2), - ... 
) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 4.0 │ - │ 6.0 ┆ 5.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.25 │ - │ 3.0 ┆ 0.5 │ - │ 4.0 ┆ 0.75 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ 1.25 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 4.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - >>> df_temporal.with_columns( - ... rolling_row_min=pl.col("row_nr").rolling_min( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_min │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling max (moving max) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their max. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. 
Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ 6.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.25 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 3.75 │ - │ 6.0 ┆ 4.5 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 3.0 │ - │ 3.0 ┆ 4.0 │ - │ 4.0 ┆ 5.0 │ - │ 5.0 ┆ 6.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... 
).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling max with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_max=pl.col("row_nr").rolling_max( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_max │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling max with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_max=pl.col("row_nr").rolling_max( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_max │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling mean (moving mean) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their mean. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). 
Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴──────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.75 │ - │ 3.0 ┆ 2.75 │ - │ 4.0 ┆ 3.75 │ - │ 5.0 ┆ 4.75 │ - │ 6.0 ┆ 5.75 │ - └─────┴──────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ null │ - └─────┴──────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... 
).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling mean with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_mean=pl.col("row_nr").rolling_mean( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬──────────────────┐ - │ row_nr ┆ date ┆ rolling_row_mean │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪══════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ - └────────┴─────────────────────┴──────────────────┘ - - Compute the rolling mean with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_mean=pl.col("row_nr").rolling_mean( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬──────────────────┐ - │ row_nr ┆ date ┆ rolling_row_mean │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪══════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ - └────────┴─────────────────────┴──────────────────┘ - - ''' - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling sum (moving sum) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their sum. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. 
Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - of dtype `{Date, Datetime}` - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 3.0 │ - │ 3.0 ┆ 5.0 │ - │ 4.0 ┆ 7.0 │ - │ 5.0 ┆ 9.0 │ - │ 6.0 ┆ 11.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.75 │ - │ 3.0 ┆ 2.75 │ - │ 4.0 ┆ 3.75 │ - │ 5.0 ┆ 4.75 │ - │ 6.0 ┆ 5.75 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 6.0 │ - │ 3.0 ┆ 9.0 │ - │ 4.0 ┆ 12.0 │ - │ 5.0 ┆ 15.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... 
).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling sum with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_sum=pl.col("row_nr").rolling_sum( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_sum │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling sum with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_sum=pl.col("row_nr").rolling_sum( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_sum │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling standard deviation. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` means - the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. 
- weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.707107 │ - │ 3.0 ┆ 0.707107 │ - │ 4.0 ┆ 0.707107 │ - │ 5.0 ┆ 0.707107 │ - │ 6.0 ┆ 0.707107 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.433013 │ - │ 3.0 ┆ 0.433013 │ - │ 4.0 ┆ 0.433013 │ - │ 5.0 ┆ 0.433013 │ - │ 6.0 ┆ 0.433013 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 1.0 │ - │ 4.0 ┆ 1.0 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling std with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_std=pl.col("row_nr").rolling_std( - ... 
window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_std │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling std with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_std=pl.col("row_nr").rolling_std( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_std │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling variance. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. 
- closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.5 │ - │ 3.0 ┆ 0.5 │ - │ 4.0 ┆ 0.5 │ - │ 5.0 ┆ 0.5 │ - │ 6.0 ┆ 0.5 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.1875 │ - │ 3.0 ┆ 0.1875 │ - │ 4.0 ┆ 0.1875 │ - │ 5.0 ┆ 0.1875 │ - │ 6.0 ┆ 0.1875 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 1.0 │ - │ 4.0 ┆ 1.0 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling var with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_var=pl.col("row_nr").rolling_var( - ... window_size="2h", by="date", closed="left" - ... ) - ... 
) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_var │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling var with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_var=pl.col("row_nr").rolling_var( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_var │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling median. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` means - the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. 
- closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴────────────────┘ - - Specify weights for the values in each window: - - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴────────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ null │ - └─────┴────────────────┘ - - ''' - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling quantile. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - window_size - The length of the window. Can be a fixed integer size, or a dynamic - temporal size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. 
- weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, window_size=4 - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 2.0 │ - │ 5.0 ┆ 3.0 │ - │ 6.0 ┆ 4.0 │ - └─────┴──────────────────┘ - - Specify weights for the values in each window: - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 2.0 │ - │ 5.0 ┆ 3.0 │ - │ 6.0 ┆ 4.0 │ - └─────┴──────────────────┘ - - Specify weights and interpolation method - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, - ... window_size=4, - ... weights=[0.2, 0.4, 0.4, 0.2], - ... interpolation="linear", - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 1.625 │ - │ 5.0 ┆ 2.625 │ - │ 6.0 ┆ 3.625 │ - └─────┴──────────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.2, window_size=5, center=True - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ null │ - │ 6.0 ┆ null │ - └─────┴──────────────────┘ - - ''' - def rolling_skew(self, window_size: int) -> Self: - ''' - Compute a rolling skew. - - The window at a given row includes the row itself and the - `window_size - 1` elements before it. - - Parameters - ---------- - window_size - Integer size of the rolling window. - bias - If False, the calculations are corrected for statistical bias. 
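As an aside, none of the rolling examples in this stub pass `min_periods`, even though every rolling method documents it. A minimal illustrative sketch of its effect (the small DataFrame below is made up for illustration and is not part of the stubs):

import polars as pl

df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0]})

# With the default min_periods (= window_size), the first window_size - 1
# rows are null because the window is not yet "full".
df.with_columns(full=pl.col("A").rolling_mean(window_size=3))

# With min_periods=1, a result is emitted as soon as one non-null value is
# present, so the leading rows become partial-window means instead
# (expected roughly [1.0, 1.5, 2.0, 3.0]).
df.with_columns(partial=pl.col("A").rolling_mean(window_size=3, min_periods=1))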
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) - >>> df.select(pl.col("a").rolling_skew(3)) - shape: (4, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ null │ - │ null │ - │ 0.381802 │ - │ 0.47033 │ - └──────────┘ - - Note how the values match the following: - - >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() - (0.38180177416060584, 0.47033046033698594) - - ''' - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a custom rolling window function. - - .. warning:: - Computing custom functions is extremely slow. Use specialized rolling - functions such as :func:`Expr.rolling_sum` if at all possible. - - Parameters - ---------- - function - Custom aggregation function. - window_size - Size of the window. The window at a given row will include the row - itself and the `window_size - 1` elements before it. - weights - A list of weights with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window. - - Examples - -------- - >>> from numpy import nansum - >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) - >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) - shape: (5, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ null │ - │ null │ - │ 22.0 │ - │ 11.0 │ - │ 17.0 │ - └──────┘ - - ''' - def abs(self) -> Self: - ''' - Compute absolute values. - - Same as `abs(expr)`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [-1.0, 0.0, 1.0, 2.0], - ... } - ... ) - >>> df.select(pl.col("A").abs()) - shape: (4, 1) - ┌─────┐ - │ A │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 0.0 │ - │ 1.0 │ - │ 2.0 │ - └─────┘ - - ''' - def rank(self, method: RankMethod = ...) -> Self: - ''' - Assign ranks to data, dealing with ties appropriately. - - Parameters - ---------- - method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} - The method used to assign ranks to tied elements. - The following methods are available (default is \'average\'): - - - \'average\' : The average of the ranks that would have been assigned to - all the tied values is assigned to each value. - - \'min\' : The minimum of the ranks that would have been assigned to all - the tied values is assigned to each value. (This is also referred to - as "competition" ranking.) - - \'max\' : The maximum of the ranks that would have been assigned to all - the tied values is assigned to each value. - - \'dense\' : Like \'min\', but the rank of the next highest element is - assigned the rank immediately after those assigned to the tied - elements. - - \'ordinal\' : All values are given a distinct rank, corresponding to - the order that the values occur in the Series. - - \'random\' : Like \'ordinal\', but the rank for ties is not dependent - on the order that the values occur in the Series. - descending - Rank in descending order. - seed - If `method="random"`, use this as seed. 
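Before the examples below, a short sketch of the tie-breaking methods that the parameter description covers but the doctests do not show (`'min'`, `'max'`, `'dense'`); the expected values in the comments follow standard competition/dense ranking semantics and reuse the same illustrative series as the examples:

import polars as pl

df = pl.DataFrame({"a": [3, 6, 1, 1, 6]})

df.select(
    min=pl.col("a").rank("min"),      # ties share the lowest rank:  [3, 4, 1, 1, 4]
    max=pl.col("a").rank("max"),      # ties share the highest rank: [3, 5, 2, 2, 5]
    dense=pl.col("a").rank("dense"),  # like 'min' but without gaps: [2, 3, 1, 1, 3]
)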
- - Examples - -------- - The \'average\' method: - - >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) - >>> df.select(pl.col("a").rank()) - shape: (5, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 3.0 │ - │ 4.5 │ - │ 1.5 │ - │ 1.5 │ - │ 4.5 │ - └─────┘ - - The \'ordinal\' method: - - >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) - >>> df.select(pl.col("a").rank("ordinal")) - shape: (5, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 3 │ - │ 4 │ - │ 1 │ - │ 2 │ - │ 5 │ - └─────┘ - - Use \'rank\' with \'over\' to rank within groups: - - >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) - >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) - shape: (5, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ rank │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ f64 │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ 1.0 │ - │ 1 ┆ 7 ┆ 2.0 │ - │ 2 ┆ 5 ┆ 1.0 │ - │ 2 ┆ 14 ┆ 3.0 │ - │ 2 ┆ 11 ┆ 2.0 │ - └─────┴─────┴──────┘ - - ''' - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: - ''' - Calculate the first discrete difference between shifted items. - - Parameters - ---------- - n - Number of slots to shift. - null_behavior : {\'ignore\', \'drop\'} - How to handle null values. - - Examples - -------- - >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) - >>> df.with_columns(change=pl.col("int").diff()) - shape: (5, 2) - ┌─────┬────────┐ - │ int ┆ change │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪════════╡ - │ 20 ┆ null │ - │ 10 ┆ -10 │ - │ 30 ┆ 20 │ - │ 25 ┆ -5 │ - │ 35 ┆ 10 │ - └─────┴────────┘ - - >>> df.with_columns(change=pl.col("int").diff(n=2)) - shape: (5, 2) - ┌─────┬────────┐ - │ int ┆ change │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪════════╡ - │ 20 ┆ null │ - │ 10 ┆ null │ - │ 30 ┆ 10 │ - │ 25 ┆ 15 │ - │ 35 ┆ 5 │ - └─────┴────────┘ - - >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) - shape: (3, 1) - ┌──────┐ - │ diff │ - │ --- │ - │ i64 │ - ╞══════╡ - │ 10 │ - │ 15 │ - │ 5 │ - └──────┘ - - ''' - def pct_change(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Computes percentage change between values. - - Percentage change (as fraction) between current element and most-recent - non-null element at least `n` period(s) before the current element. - - Computes the change from the previous row by default. - - Parameters - ---------- - n - periods to shift for forming percent change. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [10, 11, 12, None, 12], - ... } - ... ) - >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) - shape: (5, 2) - ┌──────┬────────────┐ - │ a ┆ pct_change │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞══════╪════════════╡ - │ 10 ┆ null │ - │ 11 ┆ 0.1 │ - │ 12 ┆ 0.090909 │ - │ null ┆ 0.0 │ - │ 12 ┆ 0.0 │ - └──────┴────────────┘ - - ''' - def skew(self) -> Self: - ''' - Compute the sample skewness of a data set. - - For normally distributed data, the skewness should be about zero. For - unimodal continuous distributions, a skewness value greater than zero means - that there is more weight in the right tail of the distribution. The - function `skewtest` can be used to determine if the skewness value - is close enough to zero, statistically speaking. - - - See scipy.stats for more information. - - Parameters - ---------- - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Notes - ----- - The sample skewness is computed as the Fisher-Pearson coefficient - of skewness, i.e. - - .. math:: g_1=\\frac{m_3}{m_2^{3/2}} - - where - - .. 
math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i - - is the biased sample :math:`i\\texttt{th}` central moment, and - :math:`\\bar{x}` is - the sample mean. If `bias` is False, the calculations are - corrected for bias and the value computed is the adjusted - Fisher-Pearson standardized moment coefficient, i.e. - - .. math:: - G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").skew()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.343622 │ - └──────────┘ - - ''' - def kurtosis(self) -> Self: - ''' - Compute the kurtosis (Fisher or Pearson) of a dataset. - - Kurtosis is the fourth central moment divided by the square of the - variance. If Fisher\'s definition is used, then 3.0 is subtracted from - the result to give 0.0 for a normal distribution. - If bias is False then the kurtosis is calculated using k statistics to - eliminate bias coming from biased moment estimators. - - See scipy.stats for more information - - Parameters - ---------- - fisher : bool, optional - If True, Fisher\'s definition is used (normal ==> 0.0). If False, - Pearson\'s definition is used (normal ==> 3.0). - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").kurtosis()) - shape: (1, 1) - ┌───────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -1.153061 │ - └───────────┘ - - ''' - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: - ''' - Set values outside the given boundaries to the boundary value. - - Parameters - ---------- - lower_bound - Lower bound. Accepts expression input. - Non-expression inputs are parsed as literals. - upper_bound - Upper bound. Accepts expression input. - Non-expression inputs are parsed as literals. - - See Also - -------- - when - - Notes - ----- - This method only works for numeric and temporal columns. To clip other data - types, consider writing a `when-then-otherwise` expression. See :func:`when`. - - Examples - -------- - Specifying both a lower and upper bound: - - >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) - >>> df.with_columns(clip=pl.col("a").clip(1, 10)) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ clip │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ -50 ┆ 1 │ - │ 5 ┆ 5 │ - │ 50 ┆ 10 │ - │ null ┆ null │ - └──────┴──────┘ - - Specifying only a single bound: - - >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ clip │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ -50 ┆ -50 │ - │ 5 ┆ 5 │ - │ 50 ┆ 10 │ - │ null ┆ null │ - └──────┴──────┘ - - ''' - def lower_bound(self) -> Self: - ''' - Calculate the lower bound. - - Returns a unit Series with the lowest value possible for the dtype of this - expression. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").lower_bound()) - shape: (1, 1) - ┌──────────────────────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════════════════════╡ - │ -9223372036854775808 │ - └──────────────────────┘ - - ''' - def upper_bound(self) -> Self: - ''' - Calculate the upper bound. - - Returns a unit Series with the highest value possible for the dtype of this - expression. 
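The `clip` docstring above notes that both bounds accept expression input, but its examples only show literal bounds. A minimal sketch with per-row bound columns (the `lo`/`hi` column names are made up for illustration):

import polars as pl

df = pl.DataFrame(
    {
        "a": [-50, 5, 50],
        "lo": [0, 0, 0],
        "hi": [10, 10, 40],
    }
)

# Each value is clamped to its own row's [lo, hi] interval,
# so the result should be [0, 5, 40].
df.with_columns(clip=pl.col("a").clip(pl.col("lo"), pl.col("hi")))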
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").upper_bound()) - shape: (1, 1) - ┌─────────────────────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════════════════════╡ - │ 9223372036854775807 │ - └─────────────────────┘ - - ''' - def sign(self) -> Self: - ''' - Compute the element-wise indication of the sign. - - The returned values can be -1, 0, or 1: - - * -1 if x < 0. - * 0 if x == 0. - * 1 if x > 0. - - (null values are preserved as-is). - - Examples - -------- - >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) - >>> df.select(pl.col("a").sign()) - shape: (5, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ -1 │ - │ 0 │ - │ 0 │ - │ 1 │ - │ null │ - └──────┘ - - ''' - def sin(self) -> Self: - ''' - Compute the element-wise value for the sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").sin()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def cos(self) -> Self: - ''' - Compute the element-wise value for the cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").cos()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def tan(self) -> Self: - ''' - Compute the element-wise value for the tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").tan().round(2)) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 1.56 │ - └──────┘ - - ''' - def cot(self) -> Self: - ''' - Compute the element-wise value for the cotangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").cot().round(2)) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 0.64 │ - └──────┘ - - ''' - def arcsin(self) -> Self: - ''' - Compute the element-wise value for the inverse sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arcsin()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.570796 │ - └──────────┘ - - ''' - def arccos(self) -> Self: - ''' - Compute the element-wise value for the inverse cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").arccos()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.570796 │ - └──────────┘ - - ''' - def arctan(self) -> Self: - ''' - Compute the element-wise value for the inverse tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arctan()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.785398 │ - └──────────┘ - - ''' - def sinh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. 
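A small illustrative sketch (not from the stubs) chaining the element-wise angle-conversion and trigonometric helpers in this stub, since all of them return :class:`Float64` and compose freely:

import polars as pl

df = pl.DataFrame({"deg": [0.0, 90.0, 180.0]})

# Convert degrees to radians first, then take the sine element-wise;
# the result is approximately [0.0, 1.0, 0.0] (up to floating-point rounding).
df.select(pl.col("deg").radians().sin())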
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").sinh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.175201 │ - └──────────┘ - - ''' - def cosh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").cosh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.543081 │ - └──────────┘ - - ''' - def tanh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").tanh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.761594 │ - └──────────┘ - - ''' - def arcsinh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arcsinh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.881374 │ - └──────────┘ - - ''' - def arccosh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arccosh()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def arctanh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arctanh()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ inf │ - └─────┘ - - ''' - def degrees(self) -> Self: - ''' - Convert from radians to degrees. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> import math - >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) - >>> df.select(pl.col("a").degrees()) - shape: (9, 1) - ┌────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞════════╡ - │ -720.0 │ - │ -540.0 │ - │ -360.0 │ - │ -180.0 │ - │ 0.0 │ - │ 180.0 │ - │ 360.0 │ - │ 540.0 │ - │ 720.0 │ - └────────┘ - ''' - def radians(self) -> Self: - ''' - Convert from degrees to radians. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) - >>> df.select(pl.col("a").radians()) - shape: (9, 1) - ┌────────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞════════════╡ - │ -12.566371 │ - │ -9.424778 │ - │ -6.283185 │ - │ -3.141593 │ - │ 0.0 │ - │ 3.141593 │ - │ 6.283185 │ - │ 9.424778 │ - │ 12.566371 │ - └────────────┘ - ''' - def reshape(self, dimensions: tuple[int, ...]) -> Self: - ''' - Reshape this Expr to a flat Series or a Series of Lists. - - Parameters - ---------- - dimensions - Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that - dimension is inferred. - - Returns - ------- - Expr - If a single dimension is given, results in an expression of the original - data type. 
- If a multiple dimensions are given, results in an expression of data type - :class:`List` with shape (rows, cols). - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) - >>> df.select(pl.col("foo").reshape((3, 3))) - shape: (3, 1) - ┌───────────┐ - │ foo │ - │ --- │ - │ list[i64] │ - ╞═══════════╡ - │ [1, 2, 3] │ - │ [4, 5, 6] │ - │ [7, 8, 9] │ - └───────────┘ - - See Also - -------- - Expr.list.explode : Explode a list column. - - ''' - def shuffle(self, seed: int | None = ...) -> Self: - ''' - Shuffle the contents of this expression. - - Parameters - ---------- - seed - Seed for the random number generator. If set to None (default), a - random seed is generated each time the shuffle is called. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").shuffle(seed=1)) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - │ 1 │ - │ 3 │ - └─────┘ - - ''' - def sample(self, n: int | IntoExprColumn | None = ...) -> Self: - ''' - Sample from this expression. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - Shuffle the order of sampled data points. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 3 │ - │ 1 │ - │ 1 │ - └─────┘ - - ''' - def ewm_mean(self) -> Self: - ''' - Exponentially-weighted moving average. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. 
For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_mean(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.666667 │ - │ 2.428571 │ - └──────────┘ - - ''' - def ewm_std(self) -> Self: - ''' - Exponentially-weighted moving standard deviation. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_std(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 0.707107 │ - │ 0.963624 │ - └──────────┘ - - ''' - def ewm_var(self) -> Self: - ''' - Exponentially-weighted moving variance. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
- adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_var(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 0.5 │ - │ 0.928571 │ - └──────────┘ - - ''' - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: - ''' - Extremely fast method for extending the Series with \'n\' copies of a value. - - Parameters - ---------- - value - A constant literal value (not an expression) with which to extend the - expression result Series; can pass None to extend with nulls. - n - The number of additional values that will be added. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3]}) - >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) - shape: (5, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 0 │ - │ 1 │ - │ 2 │ - │ 99 │ - │ 99 │ - └────────┘ - - ''' - def value_counts(self) -> Self: - ''' - Count the occurrences of unique values. - - Parameters - ---------- - sort - Sort the output by count in descending order. - If set to `False` (default), the order of the output is random. - parallel - Execute the computation in parallel. - - .. note:: - This option should likely not be enabled in a group by context, - as the computation is already parallelized per group. - - Returns - ------- - Expr - Expression of data type :class:`Struct` with mapping of unique values to - their count. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} - ... ) - >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT - shape: (3, 1) - ┌─────────────┐ - │ color │ - │ --- │ - │ struct[2] │ - ╞═════════════╡ - │ {"red",2} │ - │ {"green",1} │ - │ {"blue",3} │ - └─────────────┘ - - Sort the output by count. - - >>> df.select(pl.col("color").value_counts(sort=True)) - shape: (3, 1) - ┌─────────────┐ - │ color │ - │ --- │ - │ struct[2] │ - ╞═════════════╡ - │ {"blue",3} │ - │ {"red",2} │ - │ {"green",1} │ - └─────────────┘ - - ''' - def unique_counts(self) -> Self: - ''' - Return a count of the unique values in the order of appearance. 
- - This method differs from `value_counts` in that it does not return the - values, only the counts and might be faster - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "id": ["a", "b", "b", "c", "c", "c"], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("id").unique_counts(), - ... ] - ... ) - shape: (3, 1) - ┌─────┐ - │ id │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def log(self, base: float = ...) -> Self: - ''' - Compute the logarithm to a given base. - - Parameters - ---------- - base - Given base, defaults to `e` - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").log(base=2)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 1.0 │ - │ 1.584963 │ - └──────────┘ - - ''' - def log1p(self) -> Self: - ''' - Compute the natural logarithm of each element plus one. - - This computes `log(1 + x)` but is more numerically stable for `x` close to zero. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").log1p()) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.693147 │ - │ 1.098612 │ - │ 1.386294 │ - └──────────┘ - - ''' - def entropy(self, base: float = ...) -> Self: - ''' - Computes the entropy. - - Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. - - Parameters - ---------- - base - Given base, defaults to `e` - normalize - Normalize pk if it doesn\'t sum to 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").entropy(base=2)) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.459148 │ - └──────────┘ - >>> df.select(pl.col("a").entropy(base=2, normalize=False)) - shape: (1, 1) - ┌───────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -6.754888 │ - └───────────┘ - - ''' - def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: - ''' - Run an expression over a sliding window that increases `1` slot every iteration. - - Parameters - ---------- - expr - Expression to evaluate - min_periods - Number of valid values there should be in the window before the expression - is evaluated. valid values = `length - null_count` - parallel - Run in parallel. Don\'t do this in a group by or another operation that - already has much parallelization. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - This can be really slow as it can have `O(n^2)` complexity. Don\'t use this - for operations that visit all elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) - >>> df.select( - ... [ - ... pl.col("values").cumulative_eval( - ... pl.element().first() - pl.element().last() ** 2 - ... ) - ... ] - ... ) - shape: (5, 1) - ┌────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞════════╡ - │ 0.0 │ - │ -3.0 │ - │ -8.0 │ - │ -15.0 │ - │ -24.0 │ - └────────┘ - - ''' - def set_sorted(self) -> Self: - ''' - Flags the expression as \'sorted\'. - - Enables downstream code to user fast paths for sorted arrays. - - Parameters - ---------- - descending - Whether the `Series` order is descending. - - Warnings - -------- - This can lead to incorrect results if this `Series` is not sorted!! - Use with care! 
- - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3]}) - >>> df.select(pl.col("values").set_sorted().max()) - shape: (1, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 3 │ - └────────┘ - - ''' - def shrink_dtype(self) -> Self: - ''' - Shrink numeric columns to the minimal required datatype. - - Shrink to the dtype needed to fit the extrema of this [`Series`]. - This can be used to reduce memory pressure. - - Examples - -------- - >>> pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [1, 2, 2 << 32], - ... "c": [-1, 2, 1 << 30], - ... "d": [-112, 2, 112], - ... "e": [-112, 2, 129], - ... "f": ["a", "b", "c"], - ... "g": [0.1, 1.32, 0.12], - ... "h": [True, None, False], - ... } - ... ).select(pl.all().shrink_dtype()) - shape: (3, 8) - ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ - ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ - │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ - │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ - │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ - └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ - - ''' - def cache(self) -> Self: - """ - Cache this expression so that it only is executed once per context. - - .. deprecated:: 0.18.9 - This method now does nothing. It has been superseded by the - `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically - caches expressions that are equal. - - """ - def replace(self, mapping: dict[Any, Any]) -> Self: - ''' - Replace values according to the given mapping. - - Needs a global string cache for lazily evaluated queries on columns of - type `Categorical`. - - Parameters - ---------- - mapping - Mapping of values to their replacement. - default - Value to use when the mapping does not contain the lookup value. - Defaults to keeping the original value. Accepts expression input. - Non-expression inputs are parsed as literals. - return_dtype - Set return dtype to override automatic return dtype determination. - - See Also - -------- - str.replace - - Examples - -------- - Replace a single value by another value. Values not in the mapping remain - unchanged. - - >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) - >>> df.with_columns(pl.col("a").replace({2: 100}).alias("replaced")) - shape: (4, 2) - ┌─────┬──────────┐ - │ a ┆ replaced │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════════╡ - │ 1 ┆ 1 │ - │ 2 ┆ 100 │ - │ 2 ┆ 100 │ - │ 3 ┆ 3 │ - └─────┴──────────┘ - - Replace multiple values. Specify a default to set values not in the given map - to the default value. - - >>> df = pl.DataFrame({"country_code": ["FR", "ES", "DE", None]}) - >>> country_code_map = { - ... "CA": "Canada", - ... "DE": "Germany", - ... "FR": "France", - ... None: "unspecified", - ... } - >>> df.with_columns( - ... pl.col("country_code") - ... .replace(country_code_map, default=None) - ... .alias("replaced") - ... ) - shape: (4, 2) - ┌──────────────┬─────────────┐ - │ country_code ┆ replaced │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞══════════════╪═════════════╡ - │ FR ┆ France │ - │ ES ┆ null │ - │ DE ┆ Germany │ - │ null ┆ unspecified │ - └──────────────┴─────────────┘ - - The return type can be overridden with the `return_dtype` argument. - - >>> df = df.with_row_count() - >>> df.select( - ... "row_nr", - ... pl.col("row_nr") - ... .replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) - ... 
.alias("replaced"), - ... ) - shape: (4, 2) - ┌────────┬──────────┐ - │ row_nr ┆ replaced │ - │ --- ┆ --- │ - │ u32 ┆ u8 │ - ╞════════╪══════════╡ - │ 0 ┆ 0 │ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 0 │ - └────────┴──────────┘ - - To reference other columns as a `default` value, a struct column must be - constructed first. The first field must be the column in which values are - replaced. The other columns can be used in the default expression. - - >>> df.with_columns( - ... pl.struct("country_code", "row_nr") - ... .replace( - ... mapping=country_code_map, - ... default=pl.col("row_nr").cast(pl.Utf8), - ... ) - ... .alias("replaced") - ... ) - shape: (4, 3) - ┌────────┬──────────────┬─────────────┐ - │ row_nr ┆ country_code ┆ replaced │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ str ┆ str │ - ╞════════╪══════════════╪═════════════╡ - │ 0 ┆ FR ┆ France │ - │ 1 ┆ ES ┆ 1 │ - │ 2 ┆ DE ┆ Germany │ - │ 3 ┆ null ┆ unspecified │ - └────────┴──────────────┴─────────────┘ - ''' - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom python function to a Series or sequence of Series. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.map_batches`. - - Parameters - ---------- - function - Lambda/ function to apply. - return_dtype - Dtype of the output Series. - agg_list - Aggregate list - - """ - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.map_elements`. - - Parameters - ---------- - function - Lambda/ function to apply. - return_dtype - Dtype of the output Series. - If not set, the dtype will be - `polars.Unknown`. - skip_nulls - Don't apply the function over values - that contain nulls. This is faster. - pass_name - Pass the Series name to the custom function - This is more expensive. - strategy : {'thread_local', 'threading'} - This functionality is in `alpha` stage. This may be removed - /changed without it being considered a breaking change. - - - 'thread_local': run the python function on a single thread. - - 'threading': run the python function on separate threads. Use with - care as this can slow performance. This might only speed up - your code if the amount of work per element is significant - and the python function releases the GIL (e.g. via calling - a c function) - - """ - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - """ - Apply a custom rolling window function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.rolling_map`. - - Parameters - ---------- - function - Aggregation function - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - """ - def is_first(self) -> Self: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Expr.is_first_distinct`. 
- - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - """ - def is_last(self) -> Self: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Expr.is_last_distinct`. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - """ - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: - """ - Clip (limit) the values in an array to a `min` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - lower_bound - Lower bound. - - """ - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: - """ - Clip (limit) the values in an array to a `max` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - upper_bound - Upper bound. - - """ - def shift_and_fill(self, fill_value: IntoExpr) -> Self: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - Fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def register_plugin(self) -> Self: - """ - Register a shared library as a plugin. - - .. warning:: - This is highly unsafe as this will call the C function - loaded by `lib::symbol`. - - The parameters you give dictate how polars will deal - with the function. Make sure they are correct! - - .. note:: - This functionality is unstable and may change without it - being considered breaking. - - Parameters - ---------- - lib - Library to load. - symbol - Function to load. - args - Arguments (other than self) passed to this function. - These arguments have to be of type Expression. - kwargs - Non-expression arguments. They must be JSON serializable. - is_elementwise - If the function only operates on scalars - this will trigger fast paths. - input_wildcard_expansion - Expand expressions as input of this function. - returns_scalar - Automatically explode on unit length if it ran as final aggregation. - this is the case for aggregations like `sum`, `min`, `covariance` etc. - cast_to_supertypes - Cast the input datatypes to their supertype. - pass_name_to_apply - if set, then the `Series` passed to the function in the group_by operation - will ensure the name is set. This is an extra heap allocation per group. - changes_length - For example a `unique` or a `slice` - - """ - def _register_plugin(self) -> Self: ... - def take_every(self, n: int) -> Self: - """ - Take every nth value in the Series and return as a new Series. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: - """ - Take values by index. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather`. - - Parameters - ---------- - indices - An expression that leads to a UInt32 dtyped Series. - """ - def cumsum(self) -> Self: - """ - Get an array with the cumulative sum computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_sum`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cumprod(self) -> Self: - """ - Get an array with the cumulative product computed at every element. - - .. 
deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_prod`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cummin(self) -> Self: - """ - Get an array with the cumulative min computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_min`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cummax(self) -> Self: - """ - Get an array with the cumulative max computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_max`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cumcount(self) -> Self: - """ - Get an array with the cumulative count computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_count`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def map_dict(self, mapping: dict[Any, Any]) -> Self: - """ - Replace values in column according to remapping dictionary. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`replace`. The default behavior - has changed to keep any values not present in the mapping unchanged. - Pass `default=None` to keep existing behavior. - - Parameters - ---------- - mapping - Dictionary containing the before/after values to map. - default - Value to use when the remapping dict does not contain the lookup value. - Accepts expression input. Non-expression inputs are parsed as literals. - Use `pl.first()`, to keep the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - - """ - @property - def bin(self): ... - @property - def cat(self): ... - @property - def dt(self): ... - @property - def list(self): ... - @property - def arr(self): ... - @property - def meta(self): ... - @property - def name(self): ... - @property - def str(self): ... - @property - def struct(self): ... -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: - """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
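For reference, the `com`, `span`, `half_life` and `alpha` parameters documented on `ewm_mean`/`ewm_std`/`ewm_var` above all reduce to a single smoothing factor, which is what the module-level `_prepare_alpha` helper at the end of this stub normalises. Below is a minimal illustrative sketch of that normalisation, following only the formulas quoted in the docstrings; the function name is hypothetical and this is not the stub's or polars' actual implementation.

from __future__ import annotations

import math


def prepare_alpha_sketch(
    com: float | None = None,
    span: float | None = None,
    half_life: float | None = None,
    alpha: float | None = None,
) -> float:
    # Illustrative only: exactly one decay specification should be given.
    given = [v for v in (com, span, half_life, alpha) if v is not None]
    if len(given) != 1:
        raise ValueError("pass exactly one of com, span, half_life, alpha")
    if com is not None:
        # alpha = 1 / (1 + com), for com >= 0
        return 1.0 / (1.0 + com)
    if span is not None:
        # alpha = 2 / (span + 1), for span >= 1
        return 2.0 / (span + 1.0)
    if half_life is not None:
        # alpha = 1 - exp(-ln(2) / half_life), for half_life > 0
        return 1.0 - math.exp(-math.log(2.0) / half_life)
    # alpha given directly, 0 < alpha <= 1
    return alpha

As a sanity check against the docstring example above, `prepare_alpha_sketch(com=1)` gives 0.5, which with `adjust=True` reproduces the `ewm_mean(com=1)` output (1.0, 1.666667, ...).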
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/expr/expr.pyi similarity index 99% rename from polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/expr/expr rename to polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/expr/expr.pyi index 5131d44..88fed42 100644 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.15/polars/expr/expr +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/expr/expr.pyi @@ -1,3 +1,4 @@ +#: version 0.19.19 import P import np as np import pl diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/lazyframe/frame deleted file mode 100644 index 561f5b2..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/lazyframe/frame +++ /dev/null @@ -1,4211 +0,0 @@ -import P -import np -import pa -from builtins import PyLazyFrame -from pathlib import Path -from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 -from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat -from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy -from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath -from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence - -TYPE_CHECKING: bool -DTYPE_TEMPORAL_UNITS: frozenset -N_INFER_DEFAULT: int - -class LazyFrame: - _accessors: _ClassVar[set] = ... - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... 
- @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - @classmethod - def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: - """ - Lazily read from a CSV file or multiple files via glob patterns. - - Use `pl.scan_csv` to dispatch to this method. - - See Also - -------- - polars.io.scan_csv - - """ - @classmethod - def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: - """ - Lazily read from a parquet file or multiple files via glob patterns. - - Use `pl.scan_parquet` to dispatch to this method. - - See Also - -------- - polars.io.scan_parquet - - """ - @classmethod - def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: - """ - Lazily read from an Arrow IPC (Feather v2) file. - - Use `pl.scan_ipc` to dispatch to this method. - - See Also - -------- - polars.io.scan_ipc - - """ - @classmethod - def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: - """ - Lazily read from a newline delimited JSON file. - - Use `pl.scan_ndjson` to dispatch to this method. - - See Also - -------- - polars.io.scan_ndjson - - """ - @classmethod - def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: - """ - Read a logical plan from a JSON string to construct a LazyFrame. - - .. deprecated:: 0.18.12 - This method is deprecated. Convert the JSON string to `StringIO` - and then use `LazyFrame.deserialize`. - - Parameters - ---------- - json - String in JSON format. - - See Also - -------- - deserialize - - """ - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: - """ - Read a logical plan from a JSON file to construct a LazyFrame. - - .. deprecated:: 0.18.12 - This class method has been renamed to `deserialize`. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - - See Also - -------- - deserialize - - """ - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: - ''' - Read a logical plan from a JSON file to construct a LazyFrame. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - - See Also - -------- - LazyFrame.serialize - - Examples - -------- - >>> import io - >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() - >>> json = lf.serialize() - >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def __dataframe_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. - """ - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... - def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... 
- def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def _repr_html_(self) -> str: ... - def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: - ''' - Serialize the logical plan of this LazyFrame to a file or string in JSON format. - - Parameters - ---------- - file - File path to which the result should be written. If set to `None` - (default), the output is returned as a string instead. - - See Also - -------- - LazyFrame.deserialize - - Examples - -------- - Serialize the logical plan into a JSON string. - - >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() - >>> json = lf.serialize() - >>> json - \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' - - The logical plan can later be deserialized back into a LazyFrame. - - >>> import io - >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: - """ - Serialize the logical plan of this LazyFrame to a file or string in JSON format. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`LazyFrame.serialize`. - - Parameters - ---------- - file - File path to which the result should be written. If set to `None` - (default), the output is returned as a string instead. - """ - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the frame as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Examples - -------- - >>> def cast_str_to_int(data, col_name): - ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) - ... - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": ["10", "20", "30", "40"], - ... } - ... ) - >>> lf.pipe(cast_str_to_int, col_name="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 30 │ - │ 4 ┆ 40 │ - └─────┴─────┘ - - >>> lf = pl.LazyFrame( - ... { - ... "b": [1, 2], - ... "a": [3, 4], - ... } - ... ) - >>> lf.collect() - shape: (2, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - └─────┴─────┘ - >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 1 │ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def explain(self) -> str: - ''' - Create a string representation of the query plan. - - Different optimizations can be turned on or off. - - Parameters - ---------- - optimized - Return an optimized query plan. Defaults to `True`. - If this is set to `True` the subsequent - optimization flags control which optimizations - run. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. 
- simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).explain() # doctest: +SKIP - ''' - def show_graph(self) -> str | None: - ''' - Show a plot of the query plan. Note that you should have graphviz installed. - - Parameters - ---------- - optimized - Optimize the query plan. - show - Show the figure. - output_path - Write the figure to disk. - raw_output - Return dot syntax. This cannot be combined with `show` and/or `output_path`. - figsize - Passed to matplotlib if `show` == True. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).show_graph() # doctest: +SKIP - - ''' - def inspect(self, fmt: str = ...) -> Self: - ''' - Inspect a node in the computation graph. - - Print the value that this node in the computation graph evaluates to and passes - on the value. - - Examples - -------- - >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) - >>> ( - ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) - ... .inspect() # print the node before the filter - ... .filter(pl.col("bar") == pl.col("foo")) - ... ) # doctest: +ELLIPSIS - - - ''' - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: - ''' - Sort the DataFrame by the given columns. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. - *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, None], - ... "b": [6.0, 5.0, 4.0], - ... "c": ["a", "c", "b"], - ... } - ... 
) - >>> lf.sort("a").collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - Sorting by expressions is also supported. - - >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - └──────┴─────┴─────┘ - - Sort by multiple columns by passing a list of columns. - - >>> lf.sort(["c", "a"], descending=True).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - └──────┴─────┴─────┘ - - Or use positional arguments to sort by multiple columns in the same way. - - >>> lf.sort("c", "a", descending=[False, True]).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - ''' - def top_k(self, k: int) -> Self: - ''' - Return the `k` largest elements. - - If \'descending=True` the smallest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might - be worse since this requires a stable search. - - See Also - -------- - bottom_k - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 largest values in column b. - - >>> lf.top_k(4, by="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ a ┆ 2 │ - │ b ┆ 2 │ - │ b ┆ 1 │ - └─────┴─────┘ - - Get the rows which contain the 4 largest values when sorting on column b and a. - - >>> lf.top_k(4, by=["b", "a"]).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 2 │ - │ c ┆ 1 │ - └─────┴─────┘ - - ''' - def bottom_k(self, k: int) -> Self: - ''' - Return the `k` smallest elements. - - If \'descending=True` the largest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - top_k - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 smallest values in column b. 
- - >>> lf.bottom_k(4, by="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 1 │ - │ a ┆ 1 │ - │ c ┆ 1 │ - │ a ┆ 2 │ - └─────┴─────┘ - - Get the rows which contain the 4 smallest values when sorting on column a and b. - - >>> lf.bottom_k(4, by=["a", "b"]).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ b ┆ 1 │ - │ b ┆ 2 │ - └─────┴─────┘ - - ''' - def profile(self) -> tuple[DataFrame, DataFrame]: - ''' - Profile a LazyFrame. - - This will run the query and return a tuple - containing the materialized DataFrame and a DataFrame that - contains profiling information of each node that is executed. - - The units of the timings are microseconds. - - Parameters - ---------- - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - show_plot - Show a gantt chart of the profiling result - truncate_nodes - Truncate the label lengths in the gantt chart to this number of - characters. - figsize - matplotlib figsize of the profiling plot - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).profile() # doctest: +SKIP - (shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘, - shape: (3, 3) - ┌─────────────────────────┬───────┬──────┐ - │ node ┆ start ┆ end │ - │ --- ┆ --- ┆ --- │ - │ str ┆ u64 ┆ u64 │ - ╞═════════════════════════╪═══════╪══════╡ - │ optimization ┆ 0 ┆ 5 │ - │ group_by_partitioned(a) ┆ 5 ┆ 470 │ - │ sort(a) ┆ 475 ┆ 1964 │ - └─────────────────────────┴───────┴──────┘) - - ''' - def collect(self) -> DataFrame: - ''' - Materialize this LazyFrame into a DataFrame. - - By default, all query optimizations are enabled. Individual optimizations may - be disabled by setting the corresponding parameter to `False`. - - Parameters - ---------- - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - no_optimization - Turn off (certain) optimizations. - streaming - Process the query in batches to handle larger-than-memory data. - If set to `False` (default), the entire query is processed in a single - batch. - - .. warning:: - This functionality is currently in an alpha state. - - .. note:: - Use :func:`explain` to see if Polars can process the query in streaming - mode. 
- - Returns - ------- - DataFrame - - See Also - -------- - fetch: Run the query on the first `n` rows only for debugging purposes. - explain : Print the query plan that is evaluated with collect. - profile : Collect the LazyFrame and time each node in the computation graph. - polars.collect_all : Collect multiple LazyFrames at the same time. - polars.Config.set_streaming_chunk_size : Set the size of streaming batches. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - - Collect in streaming mode - - >>> lf.group_by("a").agg(pl.all().sum()).collect( - ... streaming=True - ... ) # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: - ''' - Collect DataFrame asynchronously in thread pool. - - Collects into a DataFrame (like :func:`collect`), but instead of returning - DataFrame directly, they are scheduled to be collected inside thread pool, - while this method returns almost instantly. - - May be useful if you use gevent or asyncio and want to release control to other - greenlets/tasks while LazyFrames are being collected. - - Parameters - ---------- - gevent - Return wrapper to `gevent.event.AsyncResult` instead of Awaitable - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Notes - ----- - In case of error `set_exception` is used on - `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - See Also - -------- - polars.collect_all : Collect multiple LazyFrames at the same time. - polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. - - Returns - ------- - If `gevent=False` (default) then returns awaitable. - - If `gevent=True` then returns wrapper that has - `.get(block=True, timeout=None)` method. - - Examples - -------- - >>> import asyncio - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> async def main(): - ... return await ( - ... lf.group_by("a", maintain_order=True) - ... .agg(pl.all().sum()) - ... .collect_async() - ... ) - ... 
- >>> asyncio.run(main()) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - ''' - def sink_parquet(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to a Parquet file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. - Choose "snappy" for more backwards compatibility guarantees - when you deal with older parquet readers. - compression_level - The level of compression to use. Higher compression means smaller files on - disk. - - - "gzip" : min-level: 0, max-level: 10. - - "brotli" : min-level: 0, max-level: 11. - - "zstd" : min-level: 1, max-level: 22. - statistics - Write statistics to the parquet headers. This requires extra compute. - row_group_size - Size of the row groups in number of rows. - If None (default), the chunks of the `DataFrame` are - used. Writing in smaller chunks may reduce memory pressure and improve - writing speeds. - data_pagesize_limit - Size limit of individual data pages. - If not set defaults to 1024 * 1024 bytes - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_parquet("out.parquet") # doctest: +SKIP - - ''' - def sink_ipc(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to an IPC file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - compression : {\'lz4\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_ipc("out.arrow") # doctest: +SKIP - - ''' - def sink_csv(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to a CSV file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - include_bom - Whether to include UTF-8 BOM in the CSV output. 
- include_header - Whether to include header in the CSV output. - separator - Separate CSV fields with this symbol. - line_terminator - String used to end each row. - quote_char - Byte to use as quoting character. - batch_size - Number of rows that will be processed per thread. - datetime_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. If no format specified, the default fractional-second - precision is inferred from the maximum timeunit found in the frame\'s - Datetime cols (if any). - date_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. - time_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. - float_precision - Number of decimal places to write, applied to both `Float32` and - `Float64` datatypes. - null_value - A string representing null values (defaulting to the empty string). - quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} - Determines the quoting strategy used. - - - necessary (default): This puts quotes around fields only when necessary. - They are necessary when fields contain a quote, - delimiter or record terminator. - Quotes are also necessary when writing an empty record - (which is indistinguishable from a record with one empty field). - This is the default. - - always: This puts quotes around every field. Always. - - never: This never puts quotes around fields, even if that results in - invalid CSV data (e.g.: by not quoting strings containing the - separator). - - non_numeric: This puts quotes around all fields that are non-numeric. - Namely, when writing a field that does not parse as a valid float - or integer, then quotes will be used even if they aren't strictly - necessary. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_csv("out.csv") # doctest: +SKIP - - ''' - def sink_ndjson(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to an NDJSON file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_ndjson("out.ndjson") # doctest: +SKIP - - ''' - def _set_sink_optimizations(self) -> PyLazyFrame: ... - def fetch(self, n_rows: int = ...) -> DataFrame: - ''' - Collect a small number of rows for debugging purposes.
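-
- For instance, the row count passed to `fetch` caps how many rows each scan reads
- *before* the rest of the query runs, whereas :func:`head`/:func:`limit` trim the
- final result. A minimal sketch (hypothetical file path; both calls skipped):
-
- >>> lf = pl.scan_csv("data.csv").filter(pl.col("x") > 0)  # doctest: +SKIP
- >>> lf.fetch(3)  # reads at most 3 rows from the CSV, then filters  # doctest: +SKIP
- >>> lf.head(3).collect()  # runs the full query, then keeps the first 3 rows  # doctest: +SKIP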
- - Parameters - ---------- - n_rows - Collect n_rows from the data sources. - type_coercion - Run type coercion optimization. - predicate_pushdown - Run predicate pushdown optimization. - projection_pushdown - Run projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off optimizations. - slice_pushdown - Slice pushdown optimization - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Notes - ----- - This is similar to a :func:`collect` operation, but it overwrites the number of - rows read by *every* scan operation. Be aware that `fetch` does not guarantee - the final number of rows in the DataFrame. Filters, join operations and fewer - rows being available in the scanned data will all influence the final number - of rows (joins are especially susceptible to this, and may return no data - at all if `n_rows` is too small as the join keys may not be present). - - Warnings - -------- - This is strictly a utility function that can help to debug queries using a - smaller number of rows, and should *not* be used in production code. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 6 │ - │ b ┆ 2 ┆ 5 │ - └─────┴─────┴─────┘ - - ''' - def lazy(self) -> Self: - ''' - Return lazy representation, i.e. itself. - - Useful for writing code that expects either a :class:`DataFrame` or - :class:`LazyFrame`. - - Returns - ------- - LazyFrame - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.lazy() # doctest: +ELLIPSIS - - - ''' - def cache(self) -> Self: - """Cache the result once the execution of the physical plan hits this node.""" - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: - ''' - Cast LazyFrame column(s) to the specified dtype(s). - - Parameters - ---------- - dtypes - Mapping of column names (or selector) to dtypes, or a single dtype - to which all columns will be cast. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> from datetime import date - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], - ... } - ... 
) - - Cast specific frame columns to the specified dtypes: - - >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ u8 ┆ date │ - ╞═════╪═════╪════════════╡ - │ 1.0 ┆ 6 ┆ 2020-01-02 │ - │ 2.0 ┆ 7 ┆ 2021-03-04 │ - │ 3.0 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - Cast all frame columns to the specified dtype: - - >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) - {\'foo\': [\'1\', \'2\', \'3\'], - \'bar\': [\'6.0\', \'7.0\', \'8.0\'], - \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} - - Use selectors to define the columns being cast: - - >>> import polars.selectors as cs - >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ str │ - ╞═════╪═════╪════════════╡ - │ 1 ┆ 6 ┆ 2020-01-02 │ - │ 2 ┆ 7 ┆ 2021-03-04 │ - │ 3 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - ''' - def clear(self, n: int = ...) -> LazyFrame: - ''' - Create an empty copy of the current LazyFrame, with zero to \'n\' rows. - - Returns a copy with an identical schema but no data. - - Parameters - ---------- - n - Number of (empty) rows to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.clear().fetch() - shape: (0, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪═════╪══════╡ - └─────┴─────┴──────┘ - - >>> lf.clear(2).fetch() - shape: (2, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪══════╪══════╡ - │ null ┆ null ┆ null │ - │ null ┆ null ┆ null │ - └──────┴──────┴──────┘ - - ''' - def clone(self) -> Self: - ''' - Create a copy of this LazyFrame. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current LazyFrame, with identical - schema but no data. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.clone() # doctest: +ELLIPSIS - - - ''' - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: - ''' - Filter the rows in the LazyFrame based on a predicate expression. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - predicates - Expression that evaluates to a boolean Series. - constraints - Column filters. Use name=value to filter column name by the supplied value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - - Filter on one condition: - - >>> lf.filter(pl.col("foo") > 1).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Filter on multiple conditions: - - >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `*args` syntax: - - >>> lf.filter( - ... pl.col("foo") == 1, - ... pl.col("ham") == "a", - ... ).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `**kwargs` syntax: - - >>> lf.filter(foo=1, ham="a").collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Filter on an OR condition: - - >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - ''' - Select columns from this LazyFrame. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Examples - -------- - Pass the name of a column to select that column. - - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.select("foo").collect() - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Multiple columns can be selected by passing a list of column names. - - >>> lf.select(["foo", "bar"]).collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - └─────┴─────┘ - - Multiple columns can also be selected using positional arguments instead of a - list. Expressions are also accepted. - - >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - └─────┴─────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> lf.select( - ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) - ... ).collect() - shape: (3, 1) - ┌───────────┐ - │ threshold │ - │ --- │ - │ i32 │ - ╞═══════════╡ - │ 0 │ - │ 0 │ - │ 10 │ - └───────────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... lf.select( - ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), - ... ).collect() - ... 
- shape: (3, 1) - ┌───────────┐ - │ is_odd │ - │ --- │ - │ struct[2] │ - ╞═══════════╡ - │ {1,0} │ - │ {0,1} │ - │ {1,0} │ - └───────────┘ - - ''' - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - """ - Select columns from this LazyFrame. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - See Also - -------- - select - - """ - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: - ''' - Start a group by operation. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Setting this to `True` blocks the possibility - to run on the streaming engine. - - Examples - -------- - Group by one column and call `agg` to compute the grouped sum of another - column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 2 │ - │ b ┆ 5 │ - │ c ┆ 3 │ - └─────┴─────┘ - - Set `maintain_order=True` to ensure the order of the groups is consistent with - the input. - - >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() - shape: (3, 2) - ┌─────┬───────────┐ - │ a ┆ c │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [5, 3] │ - │ b ┆ [4, 2] │ - │ c ┆ [1] │ - └─────┴───────────┘ - - Group by multiple columns by passing a list of column names. - - >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - Or use positional arguments to group by multiple columns in the same way. - Expressions are also accepted. - - >>> lf.group_by("a", pl.col("b") // 2).agg( - ... pl.col("c").mean() - ... ).collect() # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ f64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 0 ┆ 4.0 │ - │ b ┆ 1 ┆ 3.0 │ - │ c ┆ 1 ┆ 1.0 │ - └─────┴─────┴─────┘ - - ''' - def rolling(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - Different from a `dynamic_group_by` the windows are now determined by the - individual values and are not of constant intervals. For constant intervals - use :func:`LazyFrame.group_by_dynamic`. - - If you have a time series ``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... 
- * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... - * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - group_by_dynamic - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> out = ( - ... df.rolling(index_column="dt", period="2d") - ... .agg( - ... pl.sum("a").alias("sum_a"), - ... pl.min("a").alias("min_a"), - ... pl.max("a").alias("max_a"), - ... ) - ... .collect() - ... 
) - >>> out - shape: (6, 4) - ┌─────────────────────┬───────┬───────┬───────┐ - │ dt ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴───────┴───────┴───────┘ - - ''' - def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - Time windows are calculated and rows are assigned to windows. Different from a - normal group by is that a row can be member of multiple groups. - By default, the windows look like: - - - [start, start + period) - - [start + every, start + every + period) - - [start + 2*every, start + 2*every + period) - - ... - - where `start` is determined by `start_by`, `offset`, and `every` (see parameter - descriptions below). - - .. warning:: - The index column must be sorted in ascending order. If `by` is passed, then - the index column must be sorted in ascending order within each group. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - - .. deprecated:: 0.19.4 - Use `label` instead. - include_boundaries - Add the lower and upper bound of the window to the "_lower_boundary" and - "_upper_boundary" columns. This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - label : {\'left\', \'right\', \'datapoint\'} - Define which label to use for the window: - - - \'left\': lower boundary of the window - - \'right\': upper boundary of the window - - \'datapoint\': the first value of the index column in the given window. - If you don\'t need the label to be at one of the boundaries, choose this - option for maximum performance - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. 
- check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - rolling - - Notes - ----- - 1) If you\'re coming from pandas, then - - .. code-block:: python - - # polars - df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) - - is equivalent to - - .. code-block:: python - - # pandas - df.set_index("ts").resample("D")["value"].sum().reset_index() - - though note that, unlike pandas, polars doesn\'t add extra rows for empty - windows. If you need `index_column` to be evenly spaced, then please combine - with :func:`DataFrame.upsample`. - - 2) The `every`, `period` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a group_by_dynamic on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Examples - -------- - >>> from datetime import datetime - >>> lf = pl.LazyFrame( - ... { - ... "time": pl.datetime_range( - ... start=datetime(2021, 12, 16), - ... end=datetime(2021, 12, 16, 3), - ... interval="30m", - ... eager=True, - ... ), - ... "n": range(7), - ... } - ... ) - >>> lf.collect() - shape: (7, 2) - ┌─────────────────────┬─────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i64 │ - ╞═════════════════════╪═════╡ - │ 2021-12-16 00:00:00 ┆ 0 │ - │ 2021-12-16 00:30:00 ┆ 1 │ - │ 2021-12-16 01:00:00 ┆ 2 │ - │ 2021-12-16 01:30:00 ┆ 3 │ - │ 2021-12-16 02:00:00 ┆ 4 │ - │ 2021-12-16 02:30:00 ┆ 5 │ - │ 2021-12-16 03:00:00 ┆ 6 │ - └─────────────────────┴─────┘ - - Group by windows of 1 hour starting at 2021-12-16 00:00:00. - - >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( - ... pl.col("n") - ... ).collect() - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [1, 2] │ - │ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ 2021-12-16 02:00:00 ┆ [5, 6] │ - └─────────────────────┴───────────┘ - - The window boundaries can also be added to the aggregation result - - >>> lf.group_by_dynamic( - ... "time", every="1h", include_boundaries=True, closed="right" - ... 
).agg(pl.col("n").mean()).collect() - shape: (4, 4) - ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ - │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ - ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ - │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ - │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ - │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ - │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ - └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ - - When closed="left", the window excludes the right end of interval: - [lower_bound, upper_bound) - - >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( - ... pl.col("n") - ... ).collect() - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-16 00:00:00 ┆ [0, 1] │ - │ 2021-12-16 01:00:00 ┆ [2, 3] │ - │ 2021-12-16 02:00:00 ┆ [4, 5] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - When closed="both" the time values at the window boundaries belong to 2 groups. - - >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( - ... pl.col("n") - ... ).collect() - shape: (5, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ - │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - Dynamic group bys can also be combined with grouping on normal keys - - >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) - >>> lf.collect() - shape: (7, 3) - ┌─────────────────────┬─────┬────────┐ - │ time ┆ n ┆ groups │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ str │ - ╞═════════════════════╪═════╪════════╡ - │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ - │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ - │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ - │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ - │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ - │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ - │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ - └─────────────────────┴─────┴────────┘ - >>> lf.group_by_dynamic( - ... "time", - ... every="1h", - ... closed="both", - ... by="groups", - ... include_boundaries=True, - ... 
).agg(pl.col("n")).collect() - shape: (7, 5) - ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ - │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ - ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ - │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ - │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ - │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ - │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ - │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ - └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ - - Dynamic group by on an index column - - >>> lf = pl.LazyFrame( - ... { - ... "idx": pl.int_range(0, 6, eager=True), - ... "A": ["A", "A", "B", "B", "B", "C"], - ... } - ... ) - >>> lf.group_by_dynamic( - ... "idx", - ... every="2i", - ... period="3i", - ... include_boundaries=True, - ... closed="right", - ... ).agg(pl.col("A").alias("A_agg_list")).collect() - shape: (4, 4) - ┌─────────────────┬─────────────────┬─────┬─────────────────┐ - │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 ┆ list[str] │ - ╞═════════════════╪═════════════════╪═════╪═════════════════╡ - │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ - │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ - │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ - │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ - └─────────────────┴─────────────────┴─────┴─────────────────┘ - - ''' - def join_asof(self, other: LazyFrame) -> Self: - ''' - Perform an asof join. - - This is similar to a left-join except that we match on nearest key rather than - equal keys. - - Both DataFrames must be sorted by the join_asof key. - - For each row in the left DataFrame: - - - A "backward" search selects the last row in the right DataFrame whose - \'on\' key is less than or equal to the left\'s key. - - - A "forward" search selects the first row in the right DataFrame whose - \'on\' key is greater than or equal to the left\'s key. - - A "nearest" search selects the last row in the right DataFrame whose value - is nearest to the left\'s key. String keys are not currently supported for a - nearest search. - - The default is "backward". - - Parameters - ---------- - other - Lazy DataFrame to join with. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - by - Join on these columns before doing asof join. - by_left - Join on these columns before doing asof join. - by_right - Join on these columns before doing asof join. - strategy : {\'backward\', \'forward\', \'nearest\'} - Join strategy. - suffix - Suffix to append to columns with a duplicate name. - tolerance - Numeric tolerance. By setting this the join will only be done if the near - keys are within this distance. 
If an asof join is done on columns of dtype - "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta - object or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. - - Examples - -------- - >>> from datetime import datetime - >>> gdp = pl.LazyFrame( - ... { - ... "date": [ - ... datetime(2016, 1, 1), - ... datetime(2017, 1, 1), - ... datetime(2018, 1, 1), - ... datetime(2019, 1, 1), - ... ], # note record date: Jan 1st (sorted!) - ... "gdp": [4164, 4411, 4566, 4696], - ... } - ... ).set_sorted("date") - >>> population = pl.LazyFrame( - ... { - ... "date": [ - ... datetime(2016, 5, 12), - ... datetime(2017, 5, 12), - ... datetime(2018, 5, 12), - ... datetime(2019, 5, 12), - ... ], # note record date: May 12th (sorted!) - ... "population": [82.19, 82.66, 83.12, 83.52], - ... } - ... ).set_sorted("date") - >>> population.join_asof(gdp, on="date", strategy="backward").collect() - shape: (4, 3) - ┌─────────────────────┬────────────┬──────┐ - │ date ┆ population ┆ gdp │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ f64 ┆ i64 │ - ╞═════════════════════╪════════════╪══════╡ - │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ - │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ - │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ - │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ - └─────────────────────┴────────────┴──────┘ - - ''' - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: - ''' - Add a join operation to the Logical Plan. - - Parameters - ---------- - other - Lazy DataFrame to join with. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} - Join strategy. - - .. note:: - A left join preserves the row order of the left DataFrame. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - suffix - Suffix to append to columns with a duplicate name. - validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} - Checks if join is of specified type. - - * *many_to_many* - “m:m”: default, does not result in checks - * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets - * *one_to_many* - “1:m”: check if join keys are unique in left dataset - * *many_to_one* - “m:1”: check if join keys are unique in right dataset - - .. note:: - - - This is currently not supported the streaming engine. - - This is only supported when joined by single columns. - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. 
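-
- As an illustration of the `validate` check (hypothetical frames; the second
- collect is expected to raise because "id" is not unique in the right frame):
-
- >>> left = pl.LazyFrame({"id": [1, 2, 3], "x": ["a", "b", "c"]})
- >>> right = pl.LazyFrame({"id": [1, 2, 2], "y": [10, 20, 30]})
- >>> left.join(right, on="id", validate="1:m").collect()  # doctest: +SKIP
- >>> left.join(right, on="id", validate="1:1").collect()  # doctest: +SKIP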
- - See Also - -------- - join_asof - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> other_lf = pl.LazyFrame( - ... { - ... "apple": ["x", "y", "z"], - ... "ham": ["a", "b", "d"], - ... } - ... ) - >>> lf.join(other_lf, on="ham").collect() - shape: (2, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - └─────┴─────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="outer").collect() - shape: (4, 4) - ┌──────┬──────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞══════╪══════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ null ┆ null ┆ d ┆ z │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └──────┴──────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="left").collect() - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └─────┴─────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="semi").collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 7.0 ┆ b │ - └─────┴─────┴─────┘ - >>> lf.join(other_lf, on="ham", how="anti").collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - ''' - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - Notes - ----- - Creating a new LazyFrame using this method does not create a new copy of - existing data. - - Examples - -------- - Pass an expression to add it as a new column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ a^2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ - └─────┴──────┴───────┴──────┘ - - Added columns will replace existing columns with the same name. - - >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1.0 ┆ 0.5 ┆ true │ - │ 2.0 ┆ 4.0 ┆ true │ - │ 3.0 ┆ 10.0 ┆ false │ - │ 4.0 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - Multiple columns can be added by passing a list of expressions. - - >>> lf.with_columns( - ... [ - ... 
(pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ] - ... ).collect() - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Multiple columns also can be added using positional arguments instead of a list. - - >>> lf.with_columns( - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ).collect() - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> lf.with_columns( - ... ab=pl.col("a") * pl.col("b"), - ... not_c=pl.col("c").not_(), - ... ).collect() - shape: (4, 5) - ┌─────┬──────┬───────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ ab ┆ not_c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ - └─────┴──────┴───────┴──────┴───────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... lf.drop("c").with_columns( - ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), - ... ).collect() - ... - shape: (4, 3) - ┌─────┬──────┬─────────────┐ - │ a ┆ b ┆ diffs │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ struct[2] │ - ╞═════╪══════╪═════════════╡ - │ 1 ┆ 0.5 ┆ {null,null} │ - │ 2 ┆ 4.0 ┆ {1,3.5} │ - │ 3 ┆ 10.0 ┆ {1,6.0} │ - │ 4 ┆ 13.0 ┆ {1,3.0} │ - └─────┴──────┴─────────────┘ - - ''' - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - """ - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - See Also - -------- - with_columns - - """ - def with_context(self, other: Self | list[Self]) -> Self: - ''' - Add an external context to the computation graph. - - This allows expressions to also access columns from DataFrames - that are not part of this one. - - Parameters - ---------- - other - Lazy DataFrame to join with. 
- - Examples - -------- - >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) - >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) - >>> lf.with_context(lf_other).select( - ... pl.col("b") + pl.col("c").first() - ... ).collect() - shape: (3, 1) - ┌──────┐ - │ b │ - │ --- │ - │ str │ - ╞══════╡ - │ afoo │ - │ cfoo │ - │ null │ - └──────┘ - - Fill nulls with the median from another DataFrame: - - >>> train_lf = pl.LazyFrame( - ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} - ... ) - >>> test_lf = pl.LazyFrame( - ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} - ... ) - >>> test_lf.with_context( - ... train_lf.select(pl.all().name.suffix("_train")) - ... ).select( - ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) - ... ).collect() - shape: (3, 1) - ┌───────────┐ - │ feature_0 │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -1.0 │ - │ 0.0 │ - │ 1.0 │ - └───────────┘ - - ''' - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Remove columns from the DataFrame. - - Parameters - ---------- - columns - Name of the column(s) that should be removed from the DataFrame. - *more_columns - Additional columns to drop, specified as positional arguments. - - Examples - -------- - Drop a single column by passing the name of that column. - - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.drop("ham").collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 6.0 │ - │ 2 ┆ 7.0 │ - │ 3 ┆ 8.0 │ - └─────┴─────┘ - - Drop multiple columns by passing a selector. - - >>> import polars.selectors as cs - >>> lf.drop(cs.numeric()).collect() - shape: (3, 1) - ┌─────┐ - │ ham │ - │ --- │ - │ str │ - ╞═════╡ - │ a │ - │ b │ - │ c │ - └─────┘ - - Use positional arguments to drop multiple columns. - - >>> lf.drop("foo", "ham").collect() - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 6.0 │ - │ 7.0 │ - │ 8.0 │ - └─────┘ - - ''' - def rename(self, mapping: dict[str, str]) -> Self: - ''' - Rename column names. - - Parameters - ---------- - mapping - Key value pairs that map from old name to new name. - - Notes - ----- - If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), - polars will block projection and predicate pushdowns at this node. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.rename({"foo": "apple"}).collect() - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - - ''' - def reverse(self) -> Self: - ''' - Reverse the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "key": ["a", "b", "c"], - ... "val": [1, 2, 3], - ... } - ... ) - >>> lf.reverse().collect() - shape: (3, 2) - ┌─────┬─────┐ - │ key ┆ val │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ c ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 1 │ - └─────┴─────┘ - - ''' - def shift(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. 
- fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... ) - >>> lf.shift().collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ null ┆ null │ - │ 1 ┆ 5 │ - │ 2 ┆ 6 │ - │ 3 ┆ 7 │ - └──────┴──────┘ - - Pass a negative value to shift in the opposite direction instead. - - >>> lf.shift(-2).collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ null ┆ null │ - │ null ┆ null │ - └──────┴──────┘ - - Specify `fill_value` to fill the resulting null values. - - >>> lf.shift(-2, fill_value=100).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ 100 ┆ 100 │ - │ 100 ┆ 100 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int, length: int | None = ...) -> Self: - ''' - Get a slice of this DataFrame. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> lf.slice(1, 2).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ y ┆ 3 ┆ 4 │ - │ z ┆ 5 ┆ 6 │ - └─────┴─────┴─────┘ - - ''' - def limit(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Alias for :func:`LazyFrame.head`. - - Parameters - ---------- - n - Number of rows to return. - - Notes - ----- - Consider using the :func:`fetch` operation if you only want to test your - query. The :func:`fetch` operation will load the first `n` rows at the scan - level, whereas the :func:`head`/:func:`limit` are applied at the end. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... ) - >>> lf.limit().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - └─────┴─────┘ - >>> lf.limit(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - └─────┴─────┘ - - ''' - def head(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Notes - ----- - Consider using the :func:`fetch` operation if you only want to test your - query. The :func:`fetch` operation will load the first `n` rows at the scan - level, whereas the :func:`head`/:func:`limit` are applied at the end. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... 
) - >>> lf.head().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - └─────┴─────┘ - >>> lf.head(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - └─────┴─────┘ - - ''' - def tail(self, n: int = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... ) - >>> lf.tail().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - │ 6 ┆ 12 │ - └─────┴─────┘ - >>> lf.tail(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 5 ┆ 11 │ - │ 6 ┆ 12 │ - └─────┴─────┘ - - ''' - def last(self) -> Self: - ''' - Get the last row of the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.last().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 5 ┆ 6 │ - └─────┴─────┘ - - ''' - def first(self) -> Self: - ''' - Get the first row of the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.first().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_n_unique(self) -> Self: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.approx_n_unique().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_unique(self) -> Self: - """ - Approximate count of unique values. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`LazyFrame.approx_n_unique`. - - """ - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: - ''' - Add a column at index 0 that counts the rows. - - Parameters - ---------- - name - Name of the column to add. - offset - Start the row count at this offset. - - Warnings - -------- - This can have a negative effect on query performance. - This may, for instance, block predicate pushdown optimization. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.with_row_count().collect() - shape: (3, 3) - ┌────────┬─────┬─────┐ - │ row_nr ┆ a ┆ b │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╡ - │ 0 ┆ 1 ┆ 2 │ - │ 1 ┆ 3 ┆ 4 │ - │ 2 ┆ 5 ┆ 6 │ - └────────┴─────┴─────┘ - - ''' - def gather_every(self, n: int) -> Self: - ''' - Take every nth row in the LazyFrame and return as a new LazyFrame. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... 
) - >>> lf.gather_every(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 5 │ - │ 3 ┆ 7 │ - └─────┴─────┘ - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - matches_supertype - Fill all matching supertypes of the fill `value` literal. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, None, 4], - ... "b": [0.5, 4, None, 13], - ... } - ... ) - >>> lf.fill_null(99).collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 99 ┆ 99.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - >>> lf.fill_null(strategy="forward").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> lf.fill_null(strategy="max").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> lf.fill_null(strategy="zero").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 0 ┆ 0.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Self: - ''' - Fill floating point NaN values. - - Parameters - ---------- - value - Value to fill the NaN values with. - - Warnings - -------- - Note that floating point NaN (Not a Number) are not missing values! - To replace missing values, use :func:`fill_null` instead. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1.5, 2, float("nan"), 4], - ... "b": [0.5, 4, float("nan"), 13], - ... } - ... ) - >>> lf.fill_nan(99).collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════╡ - │ 1.5 ┆ 0.5 │ - │ 2.0 ┆ 4.0 │ - │ 99.0 ┆ 99.0 │ - │ 4.0 ┆ 13.0 │ - └──────┴──────┘ - - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their standard deviation value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.std().collect() - shape: (1, 2) - ┌──────────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪═════╡ - │ 1.290994 ┆ 0.5 │ - └──────────┴─────┘ - >>> lf.std(ddof=0).collect() - shape: (1, 2) - ┌──────────┬──────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪══════════╡ - │ 1.118034 ┆ 0.433013 │ - └──────────┴──────────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their variance value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. 
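-
- As a worked example for column "a" below (values 1, 2, 3, 4; mean 2.5): the sum of
- squared deviations is 2.25 + 0.25 + 0.25 + 2.25 = 5.0, so the variance is
- 5.0 / (4 - 1) = 1.666667 with the default `ddof=1`, and 5.0 / 4 = 1.25 with `ddof=0`.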
- - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.var().collect() - shape: (1, 2) - ┌──────────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪══════╡ - │ 1.666667 ┆ 0.25 │ - └──────────┴──────┘ - >>> lf.var(ddof=0).collect() - shape: (1, 2) - ┌──────┬────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪════════╡ - │ 1.25 ┆ 0.1875 │ - └──────┴────────┘ - - ''' - def max(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their maximum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.max().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def min(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their minimum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.min().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 1 │ - └─────┴─────┘ - - ''' - def sum(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their sum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.sum().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 10 ┆ 5 │ - └─────┴─────┘ - - ''' - def mean(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their mean value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.mean().collect() - shape: (1, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════╡ - │ 2.5 ┆ 1.25 │ - └─────┴──────┘ - - ''' - def median(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their median value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.median().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 2.5 ┆ 1.0 │ - └─────┴─────┘ - - ''' - def null_count(self) -> Self: - ''' - Aggregate the columns in the LazyFrame as the sum of their null value count. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, None, 3], - ... "bar": [6, 7, None], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.null_count().collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 1 ┆ 0 │ - └─────┴─────┴─────┘ - - ''' - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... 
) - >>> lf.quantile(0.7).collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 3.0 ┆ 1.0 │ - └─────┴─────┘ - - ''' - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: - ''' - Explode the DataFrame to long format by exploding the given columns. - - Parameters - ---------- - columns - Column names, expressions, or a selector defining them. The underlying - columns being exploded must be of List or Utf8 datatype. - *more_columns - Additional names of columns to explode, specified as positional arguments. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "letters": ["a", "a", "b", "c"], - ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], - ... } - ... ) - >>> lf.explode("numbers").collect() - shape: (8, 2) - ┌─────────┬─────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════════╪═════════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ a ┆ 3 │ - │ b ┆ 4 │ - │ b ┆ 5 │ - │ c ┆ 6 │ - │ c ┆ 7 │ - │ c ┆ 8 │ - └─────────┴─────────┘ - - ''' - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Drop duplicate rows from this DataFrame. - - Parameters - ---------- - subset - Column name(s) or selector(s), to consider when identifying - duplicate rows. If set to `None` (default), use all columns. - keep : {\'first\', \'last\', \'any\', \'none\'} - Which of the duplicate rows to keep. - - * \'any\': Does not give any guarantee of which row is kept. - This allows more optimizations. - * \'none\': Don\'t keep duplicate rows. - * \'first\': Keep first unique row. - * \'last\': Keep last unique row. - maintain_order - Keep the same order as the original DataFrame. This is more expensive to - compute. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - Returns - ------- - LazyFrame - LazyFrame with unique rows. - - Warnings - -------- - This method will fail if there is a column of type `List` in the DataFrame or - subset. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3, 1], - ... "bar": ["a", "a", "a", "a"], - ... "ham": ["b", "b", "b", "b"], - ... } - ... ) - >>> lf.unique(maintain_order=True).collect() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> lf.unique(keep="last", maintain_order=True).collect() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - - ''' - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Drop all rows that contain null values. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - subset - Column name(s) for which null values are considered. - If set to `None` (default), use all columns. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, None, 8], - ... "ham": ["a", "b", None], - ... } - ... 
) - - The default behavior of this method is to drop rows where any single - value of the row is null. - - >>> lf.drop_nulls().collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - This behaviour can be constrained to consider only a subset of columns, as - defined by name or with a selector. For example, dropping rows if there is - a null in any of the integer columns: - - >>> import polars.selectors as cs - >>> lf.drop_nulls(subset=cs.integer()).collect() - shape: (2, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ null │ - └─────┴─────┴──────┘ - - This method drops a row if any single value of the row is null. - - Below are some example snippets that show how you could drop null - values based on other conditions: - - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, None, None, None], - ... "b": [1, 2, None, 1], - ... "c": [1, None, None, 1], - ... } - ... ) - >>> lf.collect() - shape: (4, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪══════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ null ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴──────┴──────┘ - - Drop a row only if all values are null: - - >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴─────┴──────┘ - - ''' - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: - ''' - Unpivot a DataFrame from wide to long format. - - Optionally leaves identifiers set. - - This function is useful to massage a DataFrame into a format where one or more - columns are identifier variables (id_vars) while all other columns, considered - measured variables (value_vars), are "unpivoted" to the row axis leaving just - two non-identifier columns, \'variable\' and \'value\'. - - Parameters - ---------- - id_vars - Column(s) or selector(s) to use as identifier variables. - value_vars - Column(s) or selector(s) to use as values variables; if `value_vars` - is empty all columns that are not in `id_vars` will be used. - variable_name - Name to give to the `variable` column. Defaults to "variable" - value_name - Name to give to the `value` column. Defaults to "value" - streamable - Allow this node to run in the streaming engine. - If this runs in streaming, the output of the melt operation - will not have a stable ordering. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> import polars.selectors as cs - >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() - shape: (6, 3) - ┌─────┬──────────┬───────┐ - │ a ┆ variable ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 │ - ╞═════╪══════════╪═══════╡ - │ x ┆ b ┆ 1 │ - │ y ┆ b ┆ 3 │ - │ z ┆ b ┆ 5 │ - │ x ┆ c ┆ 2 │ - │ y ┆ c ┆ 4 │ - │ z ┆ c ┆ 6 │ - └─────┴──────────┴───────┘ - - ''' - def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: - ''' - Apply a custom function. 
- - It is important that the function returns a Polars DataFrame. - - Parameters - ---------- - function - Lambda/ function to apply. - predicate_pushdown - Allow predicate pushdown optimization to pass this node. - projection_pushdown - Allow projection pushdown optimization to pass this node. - slice_pushdown - Allow slice pushdown optimization to pass this node. - no_optimizations - Turn off all optimizations past this point. - schema - Output schema of the function, if set to `None` we assume that the schema - will remain unchanged by the applied function. - validate_output_schema - It is paramount that polars\' schema is correct. This flag will ensure that - the output schema of this function will be checked with the expected schema. - Setting this to `False` will not do this check, but may lead to hard to - debug bugs. - streamable - Whether the function that is given is eligible to be running with the - streaming engine. That means that the function must produce the same result - when it is executed in batches or when it is be executed on the full - dataset. - - Warnings - -------- - The `schema` of a `LazyFrame` must always be correct. It is up to the caller - of this function to ensure that this invariant is upheld. - - It is important that the optimization flags are correct. If the custom function - for instance does an aggregation of a column, `predicate_pushdown` should not - be allowed, as this prunes rows and will influence your aggregation results. - - Examples - -------- - >>> lf = ( # doctest: +SKIP - ... pl.LazyFrame( - ... { - ... "a": pl.int_range(-100_000, 0, eager=True), - ... "b": pl.int_range(0, 100_000, eager=True), - ... } - ... ) - ... .map_batches(lambda x: 2 * x, streamable=True) - ... .collect(streaming=True) - ... ) - shape: (100_000, 2) - ┌─────────┬────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════════╪════════╡ - │ -200000 ┆ 0 │ - │ -199998 ┆ 2 │ - │ -199996 ┆ 4 │ - │ -199994 ┆ 6 │ - │ … ┆ … │ - │ -8 ┆ 199992 │ - │ -6 ┆ 199994 │ - │ -4 ┆ 199996 │ - │ -2 ┆ 199998 │ - └─────────┴────────┘ - - ''' - def interpolate(self) -> Self: - ''' - Interpolate intermediate values. The interpolation method is linear. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, None, 9, 10], - ... "bar": [6, 7, 9, None], - ... "baz": [1, None, None, 9], - ... } - ... ) - >>> lf.interpolate().collect() - shape: (4, 3) - ┌──────┬──────┬──────────┐ - │ foo ┆ bar ┆ baz │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════════╡ - │ 1.0 ┆ 6.0 ┆ 1.0 │ - │ 5.0 ┆ 7.0 ┆ 3.666667 │ - │ 9.0 ┆ 9.0 ┆ 6.333333 │ - │ 10.0 ┆ null ┆ 9.0 │ - └──────┴──────┴──────────┘ - - ''' - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Decompose struct columns into separate columns for each of their fields. - - The new columns will be inserted into the DataFrame at the location of the - struct column. - - Parameters - ---------- - columns - Name of the struct column(s) that should be unnested. - *more_columns - Additional columns to unnest, specified as positional arguments. - - Examples - -------- - >>> df = pl.LazyFrame( - ... { - ... "before": ["foo", "bar"], - ... "t_a": [1, 2], - ... "t_b": ["a", "b"], - ... "t_c": [True, None], - ... "t_d": [[1, 2], [3]], - ... "after": ["baz", "womp"], - ... } - ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") - >>> df.collect() - shape: (2, 3) - ┌────────┬─────────────────────┬───────┐ - │ before ┆ t_struct ┆ after │ - │ --- ┆ --- ┆ --- │ - │ str ┆ struct[4] ┆ str │ - ╞════════╪═════════════════════╪═══════╡ - │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ - │ bar ┆ {2,"b",null,[3]} ┆ womp │ - └────────┴─────────────────────┴───────┘ - >>> df.unnest("t_struct").collect() - shape: (2, 6) - ┌────────┬─────┬─────┬──────┬───────────┬───────┐ - │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ - ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ - │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ - │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ - └────────┴─────┴─────┴──────┴───────────┴───────┘ - - ''' - def merge_sorted(self, other: LazyFrame, key: str) -> Self: - ''' - Take two sorted DataFrames and merge them by the sorted key. - - The output of this operation will also be sorted. - It is the callers responsibility that the frames are sorted - by that key otherwise the output will not make sense. - - The schemas of both LazyFrames must be equal. - - Parameters - ---------- - other - Other DataFrame that must be merged - key - Key that is sorted. - - Examples - -------- - >>> df0 = pl.LazyFrame( - ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} - ... ).sort("age") - >>> df0.collect() - shape: (3, 2) - ┌───────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═════╡ - │ bob ┆ 18 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └───────┴─────┘ - >>> df1 = pl.LazyFrame( - ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} - ... ).sort("age") - >>> df1.collect() - shape: (4, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - └────────┴─────┘ - >>> df0.merge_sorted(df1, key="age").collect() - shape: (7, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ bob ┆ 18 │ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └────────┴─────┘ - ''' - def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: - """ - Indicate that one or multiple columns are sorted. - - Parameters - ---------- - column - Columns that are sorted - more_columns - Additional columns that are sorted, specified as positional arguments. - descending - Whether the columns are sorted in descending order. - """ - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: - ''' - Update the values in this `LazyFrame` with the non-null values in `other`. - - Parameters - ---------- - other - LazyFrame that will be used to update the values - on - Column names that will be joined on; if given `None` the implicit row - index is used as a join key instead. - left_on - Join column(s) of the left DataFrame. - right_on - Join column(s) of the right DataFrame. - how : {\'left\', \'inner\', \'outer\'} - * \'left\' will keep all rows from the left table; rows may be duplicated - if multiple rows in the right frame match the left row\'s key. - * \'inner\' keeps only those rows where the key exists in both frames. 
- * \'outer\' will update existing rows where the key matches while also - adding any new rows contained in the given frame. - include_nulls - If True, null values from the right DataFrame will be used to update the - left DataFrame. - - Notes - ----- - This is syntactic sugar for a left/inner join, with an optional coalesce when - `include_nulls = False`. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "A": [1, 2, 3, 4], - ... "B": [400, 500, 600, 700], - ... } - ... ) - >>> lf.collect() - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 400 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - >>> new_lf = pl.LazyFrame( - ... { - ... "B": [-66, None, -99], - ... "C": [5, 3, 1], - ... } - ... ) - - Update `df` values with the non-null values in `new_df`, by row index: - - >>> lf.update(new_lf).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, by row index, - but only keeping those rows that are common to both frames: - - >>> lf.update(new_lf, how="inner").collect() - shape: (3, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() - shape: (5, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴─────┘ - - Update `df` values including null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> lf.update( - ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True - ... ).collect() - shape: (5, 2) - ┌─────┬──────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ null │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴──────┘ - - ''' - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: - """ - Start a group by operation. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.group_by`. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - """ - def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. 
- period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - """ - def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.9 - This method has been renamed to :func:`LazyFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - """ - def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.group_by_dynamic`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - include_boundaries - Add the lower and upper bound of the window to the "_lower_bound" and - "_upper_bound" columns. 
This will impact performance because it\'s harder to - parallelize - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - ''' - def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: - """ - Apply a custom function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.map_batches`. - - Parameters - ---------- - function - Lambda/ function to apply. - predicate_pushdown - Allow predicate pushdown optimization to pass this node. - projection_pushdown - Allow projection pushdown optimization to pass this node. - slice_pushdown - Allow slice pushdown optimization to pass this node. - no_optimizations - Turn off all optimizations past this point. - schema - Output schema of the function, if set to `None` we assume that the schema - will remain unchanged by the applied function. - validate_output_schema - It is paramount that polars' schema is correct. This flag will ensure that - the output schema of this function will be checked with the expected schema. - Setting this to `False` will not do this check, but may lead to hard to - debug bugs. - streamable - Whether the function that is given is eligible to be running with the - streaming engine. That means that the function must produce the same result - when it is executed in batches or when it is be executed on the full - dataset. - - """ - def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def take_every(self, n: int) -> Self: - """ - Take every nth row in the LazyFrame and return as a new LazyFrame. - - .. deprecated:: 0.19.0 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - @property - def columns(self): ... - @property - def dtypes(self): ... - @property - def schema(self): ... - @property - def width(self): ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/lazyframe/frame.pyi similarity index 99% rename from polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/lazyframe/frame rename to polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/lazyframe/frame.pyi index 561f5b2..4d60802 100644 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/lazyframe/frame +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/lazyframe/frame.pyi @@ -1,3 +1,4 @@ +#: version 0.19.19 import P import np import pa diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/series/series deleted file mode 100644 index 4a40006..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/series/series +++ /dev/null @@ -1,4988 +0,0 @@ -import np as np -import pa as pa -import pd as pd -from builtins import PySeries -from datetime import date, datetime, timedelta -from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Object as Object, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown, Utf8 as Utf8 -from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat -from polars.exceptions import ShapeError as ShapeError -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as 
issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence - -TYPE_CHECKING: bool -_PYARROW_AVAILABLE: bool - -class Series: - _s: _ClassVar[None] = ... - _accessors: _ClassVar[set] = ... - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array) -> Self: - """Construct a Series from an Arrow Array.""" - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: - """Construct a Series from a pandas Series or DatetimeIndex.""" - def _get_ptr(self) -> tuple[int, int, int]: - """ - Get a pointer to the start of the values buffer of a numeric Series. - - This will raise an error if the `Series` contains multiple chunks. - - This will return the offset, length and the pointer itself. - - """ - def __bool__(self) -> NoReturn: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Series: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... - def __eq__(self, other: Any) -> Series | Expr: ... - def __ne__(self, other: Any) -> Series | Expr: ... - def __gt__(self, other: Any) -> Series | Expr: ... - def __lt__(self, other: Any) -> Series | Expr: ... - def __ge__(self, other: Any) -> Series | Expr: ... - def __le__(self, other: Any) -> Series | Expr: ... - def le(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series <= other`.""" - def lt(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series < other`.""" - def eq(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series == other`.""" - def eq_missing(self, other: Any) -> Self | Expr: - ''' - Method equivalent of equality operator `series == other` where `None == None`. - - This differs from the standard `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - See Also - -------- - ne_missing - eq - - Examples - -------- - >>> s1 = pl.Series("a", [333, 200, None]) - >>> s2 = pl.Series("a", [100, 200, None]) - >>> s1.eq(s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - null - ] - >>> s1.eq_missing(s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - ''' - def ne(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series != other`.""" - def ne_missing(self, other: Any) -> Self | Expr: - ''' - Method equivalent of equality operator `series != other` where `None == None`. - - This differs from the standard `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. 
- - See Also - -------- - eq_missing - ne - - Examples - -------- - >>> s1 = pl.Series("a", [333, 200, None]) - >>> s2 = pl.Series("a", [100, 200, None]) - >>> s1.ne(s2) - shape: (3,) - Series: \'a\' [bool] - [ - true - false - null - ] - >>> s1.ne_missing(s2) - shape: (3,) - Series: \'a\' [bool] - [ - true - false - false - ] - - ''' - def ge(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series >= other`.""" - def gt(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series > other`.""" - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - def __add__(self, other: Any) -> Self | DataFrame | Expr: ... - def __sub__(self, other: Any) -> Self | Expr: ... - def __truediv__(self, other: Any) -> Series | Expr: ... - def __floordiv__(self, other: Any) -> Series | Expr: ... - def __invert__(self) -> Series: ... - def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... - def __mod__(self, other: Any) -> Series | Expr: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: - """ - Numpy __array__ interface protocol. - - Ensures that `np.asarray(pl.Series(..))` works as expected, see - https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. - """ - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: - """Numpy universal functions.""" - def __column_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. - """ - def _repr_html_(self) -> str: - """Format output data in HTML for display in Jupyter Notebooks.""" - def item(self, index: int | None = ...) -> Any: - ''' - Return the Series as a scalar, or return the element at the given index. - - If no index is provided, this is equivalent to `s[0]`, with a check - that the shape is (1,). With an index, this is equivalent to `s[index]`. - - Examples - -------- - >>> s1 = pl.Series("a", [1]) - >>> s1.item() - 1 - >>> s2 = pl.Series("a", [9, 8, 7]) - >>> s2.cum_sum().item(-1) - 24 - - ''' - def estimated_size(self, unit: SizeUnit = ...) -> int | float: - ''' - Return an estimation of the total (heap) allocated size of the Series. 
- - Estimated size is given in the specified unit (bytes by default). - - This estimation is the sum of the size of its buffers, validity, including - nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the - size of 2 arrays is not the sum of the sizes computed from this function. In - particular, [`StructArray`]\'s size is an upper bound. - - When an array is sliced, its allocated size remains constant because the buffer - unchanged. However, this function will yield a smaller number. This is because - this function returns the visible size of the buffer, not its total capacity. - - FFI buffers are included in this estimation. - - Parameters - ---------- - unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} - Scale the returned size to the given unit. - - Examples - -------- - >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) - >>> s.estimated_size() - 4000000 - >>> s.estimated_size("mb") - 3.814697265625 - - ''' - def sqrt(self) -> Series: - """ - Compute the square root of the elements. - - Syntactic sugar for - - >>> pl.Series([1, 2]) ** 0.5 - shape: (2,) - Series: '' [f64] - [ - 1.0 - 1.414214 - ] - - """ - def cbrt(self) -> Series: - """ - Compute the cube root of the elements. - - Optimization for - - >>> pl.Series([1, 2]) ** (1.0 / 3) - shape: (2,) - Series: '' [f64] - [ - 1.0 - 1.259921 - ] - - """ - def any(self) -> bool | None: - """ - Return whether any of the values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is `None`. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - bool or None - - Examples - -------- - >>> pl.Series([True, False]).any() - True - >>> pl.Series([False, False]).any() - False - >>> pl.Series([None, False]).any() - False - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None - - """ - def all(self) -> bool | None: - """ - Return whether all values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is `None`. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - bool or None - - Examples - -------- - >>> pl.Series([True, True]).all() - True - >>> pl.Series([False, True]).all() - False - >>> pl.Series([None, True]).all() - True - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None - - """ - def log(self, base: float = ...) -> Series: - """Compute the logarithm to a given base.""" - def log1p(self) -> Series: - """Compute the natural logarithm of the input array plus one, element-wise.""" - def log10(self) -> Series: - """Compute the base 10 logarithm of the input array, element-wise.""" - def exp(self) -> Series: - """Compute the exponential, element-wise.""" - def drop_nulls(self) -> Series: - ''' - Drop all null values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nans - - Notes - ----- - A null value is not the same as a NaN value. 
- To drop NaN values, use :func:`drop_nans`. - - Examples - -------- - >>> s = pl.Series([1.0, None, 3.0, float("nan")]) - >>> s.drop_nulls() - shape: (3,) - Series: \'\' [f64] - [ - 1.0 - 3.0 - NaN - ] - - ''' - def drop_nans(self) -> Series: - ''' - Drop all floating point NaN values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nulls - - Notes - ----- - A NaN value is not the same as a null value. - To drop null values, use :func:`drop_nulls`. - - Examples - -------- - >>> s = pl.Series([1.0, None, 3.0, float("nan")]) - >>> s.drop_nans() - shape: (3,) - Series: \'\' [f64] - [ - 1.0 - null - 3.0 - ] - - ''' - def to_frame(self, name: str | None = ...) -> DataFrame: - ''' - Cast this Series to a DataFrame. - - Parameters - ---------- - name - optionally name/rename the Series column in the new DataFrame. - - Examples - -------- - >>> s = pl.Series("a", [123, 456]) - >>> df = s.to_frame() - >>> df - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 123 │ - │ 456 │ - └─────┘ - - >>> df = s.to_frame("xyz") - >>> df - shape: (2, 1) - ┌─────┐ - │ xyz │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 123 │ - │ 456 │ - └─────┘ - - ''' - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: - ''' - Quick summary statistics of a Series. - - Series with mixed datatypes will return summary statistics for the datatype of - the first value. - - Parameters - ---------- - percentiles - One or more percentiles to include in the summary statistics (if the - Series has a numeric dtype). All values must be in the range `[0, 1]`. - - Notes - ----- - The median is included by default as the 50% percentile. - - Returns - ------- - DataFrame - Mapping with summary statistics of a Series. - - Examples - -------- - >>> series_num = pl.Series([1, 2, 3, 4, 5]) - >>> series_num.describe() - shape: (9, 2) - ┌────────────┬──────────┐ - │ statistic ┆ value │ - │ --- ┆ --- │ - │ str ┆ f64 │ - ╞════════════╪══════════╡ - │ count ┆ 5.0 │ - │ null_count ┆ 0.0 │ - │ mean ┆ 3.0 │ - │ std ┆ 1.581139 │ - │ min ┆ 1.0 │ - │ 25% ┆ 2.0 │ - │ 50% ┆ 3.0 │ - │ 75% ┆ 4.0 │ - │ max ┆ 5.0 │ - └────────────┴──────────┘ - - >>> series_str = pl.Series(["a", "a", None, "b", "c"]) - >>> series_str.describe() - shape: (3, 2) - ┌────────────┬───────┐ - │ statistic ┆ value │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════════╪═══════╡ - │ count ┆ 5 │ - │ null_count ┆ 1 │ - │ unique ┆ 4 │ - └────────────┴───────┘ - - ''' - def sum(self) -> int | float: - ''' - Reduce this Series to the sum value. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.sum() - 6 - - ''' - def mean(self) -> int | float | None: - ''' - Reduce this Series to the mean value. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.mean() - 2.0 - - ''' - def product(self) -> int | float: - """Reduce this Series to the product value.""" - def pow(self, exponent: int | float | None | Series) -> Series: - ''' - Raise to the power of the given exponent. - - Parameters - ---------- - exponent - The exponent. Accepts Series input. - - Examples - -------- - >>> s = pl.Series("foo", [1, 2, 3, 4]) - >>> s.pow(3) - shape: (4,) - Series: \'foo\' [f64] - [ - 1.0 - 8.0 - 27.0 - 64.0 - ] - - ''' - def min(self) -> PythonLiteral | None: - ''' - Get the minimal value in this Series. 
- - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.min() - 1 - - ''' - def max(self) -> PythonLiteral | None: - ''' - Get the maximum value in this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.max() - 3 - - ''' - def nan_max(self) -> int | float | date | datetime | timedelta | str: - """ - Get maximum value, but propagate/poison encountered NaN values. - - This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - """ - def nan_min(self) -> int | float | date | datetime | timedelta | str: - """ - Get minimum value, but propagate/poison encountered NaN values. - - This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - """ - def std(self, ddof: int = ...) -> float | None: - ''' - Get the standard deviation of this Series. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.std() - 1.0 - - ''' - def var(self, ddof: int = ...) -> float | None: - ''' - Get variance of this Series. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.var() - 1.0 - - ''' - def median(self) -> float | None: - ''' - Get the median of this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.median() - 2.0 - - ''' - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: - ''' - Get the quantile value of this Series. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.quantile(0.5) - 2.0 - - ''' - def to_dummies(self, separator: str = ...) -> DataFrame: - ''' - Get dummy/indicator variables. - - Parameters - ---------- - separator - Separator/delimiter used when generating column names. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.to_dummies() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a_1 ┆ a_2 ┆ a_3 │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 0 ┆ 0 │ - │ 0 ┆ 1 ┆ 0 │ - │ 0 ┆ 0 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def cut(self, breaks: Sequence[float]) -> Series | DataFrame: - ''' - Bin continuous values into discrete categories. - - Parameters - ---------- - breaks - List of unique cut points. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - break_point_label - Name of the breakpoint column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - category_label - Name of the category column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - left_closed - Set the intervals to be left-closed instead of right-closed. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. 
This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - as_series - If set to `False`, return a DataFrame containing the original values, - the breakpoints, and the categories. - - .. deprecated:: 0.19.0 - This parameter will be removed. The same behavior can be achieved by - setting `include_breaks=True`, unnesting the resulting struct Series, - and adding the result to the original Series. - - Returns - ------- - Series - Series of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise a Series of data type :class:`Struct`. - - See Also - -------- - qcut - - Examples - -------- - Divide the column into three categories. - - >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) - >>> s.cut([-1, 1], labels=["a", "b", "c"]) - shape: (5,) - Series: \'foo\' [cat] - [ - "a" - "a" - "b" - "b" - "c" - ] - - Create a DataFrame with the breakpoint and category for each value. - - >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") - >>> s.to_frame().with_columns(cut).unnest("cut") - shape: (5, 3) - ┌─────┬─────────────┬────────────┐ - │ foo ┆ break_point ┆ category │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪═════════════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴─────────────┴────────────┘ - - ''' - def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: - ''' - Bin continuous values into discrete categories based on their quantiles. - - Parameters - ---------- - quantiles - Either a list of quantile probabilities between 0 and 1 or a positive - integer determining the number of bins with uniform probability. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - left_closed - Set the intervals to be left-closed instead of right-closed. - allow_duplicates - If set to `True`, duplicates in the resulting quantiles are dropped, - rather than raising a `DuplicateError`. This can happen even with unique - probabilities, depending on the data. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - break_point_label - Name of the breakpoint column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - category_label - Name of the category column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - as_series - If set to `False`, return a DataFrame containing the original values, - the breakpoints, and the categories. - - .. deprecated:: 0.19.0 - This parameter will be removed. The same behavior can be achieved by - setting `include_breaks=True`, unnesting the resulting struct Series, - and adding the result to the original Series. - - Returns - ------- - Series - Series of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise a Series of data type :class:`Struct`. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. 
- - See Also - -------- - cut - - Examples - -------- - Divide a column into three categories according to pre-defined quantile - probabilities. - - >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) - >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) - shape: (5,) - Series: \'foo\' [cat] - [ - "a" - "a" - "b" - "b" - "c" - ] - - Divide a column into two categories using uniform quantile probabilities. - - >>> s.qcut(2, labels=["low", "high"], left_closed=True) - shape: (5,) - Series: \'foo\' [cat] - [ - "low" - "low" - "high" - "high" - "high" - ] - - Create a DataFrame with the breakpoint and category for each value. - - >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") - >>> s.to_frame().with_columns(cut).unnest("cut") - shape: (5, 3) - ┌─────┬─────────────┬────────────┐ - │ foo ┆ break_point ┆ category │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪═════════════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴─────────────┴────────────┘ - - ''' - def rle(self) -> Series: - ''' - Get the lengths of runs of identical values. - - Returns - ------- - Series - Series of data type :class:`Struct` with Fields "lengths" and "values". - - Examples - -------- - >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) - >>> s.rle().struct.unnest() - shape: (6, 2) - ┌─────────┬────────┐ - │ lengths ┆ values │ - │ --- ┆ --- │ - │ i32 ┆ i64 │ - ╞═════════╪════════╡ - │ 2 ┆ 1 │ - │ 1 ┆ 2 │ - │ 1 ┆ 1 │ - │ 1 ┆ null │ - │ 1 ┆ 1 │ - │ 2 ┆ 3 │ - └─────────┴────────┘ - ''' - def rle_id(self) -> Series: - ''' - Map values to run IDs. - - Similar to RLE, but it maps each value to an ID corresponding to the run into - which it falls. This is especially useful when you want to define groups by - runs of identical values rather than the values themselves. - - Returns - ------- - Series - - See Also - -------- - rle - - Examples - -------- - >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) - >>> s.rle_id() - shape: (8,) - Series: \'s\' [u32] - [ - 0 - 0 - 1 - 2 - 3 - 4 - 5 - 5 - ] - ''' - def hist(self, bins: list[float] | None = ...) -> DataFrame: - ''' - Bin values into buckets and count their occurrences. - - Parameters - ---------- - bins - Discretizations to make. - If None given, we determine the boundaries based on the data. - bin_count - If no bins provided, this will be used to determine - the distance of the bins - - Returns - ------- - DataFrame - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Examples - -------- - >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) - >>> a.hist(bin_count=4) - shape: (5, 3) - ┌─────────────┬─────────────┬─────────┐ - │ break_point ┆ category ┆ a_count │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ cat ┆ u32 │ - ╞═════════════╪═════════════╪═════════╡ - │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ - │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ - │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ - │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ - │ inf ┆ (6.75, inf] ┆ 2 │ - └─────────────┴─────────────┴─────────┘ - - ''' - def value_counts(self) -> DataFrame: - ''' - Count the occurrences of unique values. - - Parameters - ---------- - sort - Sort the output by count in descending order. - If set to `False` (default), the order of the output is random. - parallel - Execute the computation in parallel. - - .. note:: - This option should likely not be enabled in a group by context, - as the computation is already parallelized per group. 
- - Returns - ------- - DataFrame - Mapping of unique values to their count. - - Examples - -------- - >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) - >>> s.value_counts() # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌───────┬────────┐ - │ color ┆ counts │ - │ --- ┆ --- │ - │ str ┆ u32 │ - ╞═══════╪════════╡ - │ red ┆ 2 │ - │ green ┆ 1 │ - │ blue ┆ 3 │ - └───────┴────────┘ - - Sort the output by count. - - shape: (3, 2) - ┌───────┬────────┐ - │ color ┆ counts │ - │ --- ┆ --- │ - │ str ┆ u32 │ - ╞═══════╪════════╡ - │ blue ┆ 3 │ - │ red ┆ 2 │ - │ green ┆ 1 │ - └───────┴────────┘ - - ''' - def unique_counts(self) -> Series: - ''' - Return a count of the unique values in the order of appearance. - - Examples - -------- - >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) - >>> s.unique_counts() - shape: (3,) - Series: \'id\' [u32] - [ - 1 - 2 - 3 - ] - - ''' - def entropy(self, base: float = ...) -> float | None: - """ - Computes the entropy. - - Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. - - Parameters - ---------- - base - Given base, defaults to `e` - normalize - Normalize pk if it doesn't sum to 1. - - Examples - -------- - >>> a = pl.Series([0.99, 0.005, 0.005]) - >>> a.entropy(normalize=True) - 0.06293300616044681 - >>> b = pl.Series([0.65, 0.10, 0.25]) - >>> b.entropy(normalize=True) - 0.8568409950394724 - - """ - def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: - ''' - Run an expression over a sliding window that increases `1` slot every iteration. - - Parameters - ---------- - expr - Expression to evaluate - min_periods - Number of valid values there should be in the window before the expression - is evaluated. valid values = `length - null_count` - parallel - Run in parallel. Don\'t do this in a group by or another operation that - already has much parallelization. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - This can be really slow as it can have `O(n^2)` complexity. Don\'t use this - for operations that visit all elements. - - Examples - -------- - >>> s = pl.Series("values", [1, 2, 3, 4, 5]) - >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) - shape: (5,) - Series: \'values\' [f64] - [ - 0.0 - -3.0 - -8.0 - -15.0 - -24.0 - ] - - ''' - def alias(self, name: str) -> Series: - ''' - Rename the series. - - Parameters - ---------- - name - The new name. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.alias("b") - shape: (3,) - Series: \'b\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def rename(self, name: str) -> Series: - ''' - Rename this Series. - - Alias for :func:`Series.alias`. - - Parameters - ---------- - name - New name. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.rename("b") - shape: (3,) - Series: \'b\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def chunk_lengths(self) -> list[int]: - ''' - Get the length of each individual chunk. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("a", [4, 5, 6]) - - Concatenate Series with rechunk = True - - >>> pl.concat([s, s2]).chunk_lengths() - [6] - - Concatenate Series with rechunk = False - - >>> pl.concat([s, s2], rechunk=False).chunk_lengths() - [3, 3] - - ''' - def n_chunks(self) -> int: - ''' - Get the number of chunks that this Series contains. 
- - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.n_chunks() - 1 - >>> s2 = pl.Series("a", [4, 5, 6]) - - Concatenate Series with rechunk = True - - >>> pl.concat([s, s2]).n_chunks() - 1 - - Concatenate Series with rechunk = False - - >>> pl.concat([s, s2], rechunk=False).n_chunks() - 2 - - ''' - def cum_max(self) -> Series: - ''' - Get an array with the cumulative max computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Examples - -------- - >>> s = pl.Series("s", [3, 5, 1]) - >>> s.cum_max() - shape: (3,) - Series: \'s\' [i64] - [ - 3 - 5 - 5 - ] - - ''' - def cum_min(self) -> Series: - ''' - Get an array with the cumulative min computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Examples - -------- - >>> s = pl.Series("s", [1, 2, 3]) - >>> s.cum_min() - shape: (3,) - Series: \'s\' [i64] - [ - 1 - 1 - 1 - ] - - ''' - def cum_prod(self) -> Series: - ''' - Get an array with the cumulative product computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.cum_prod() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 6 - ] - - ''' - def cum_sum(self) -> Series: - ''' - Get an array with the cumulative sum computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.cum_sum() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 3 - 6 - ] - - ''' - def slice(self, offset: int, length: int | None = ...) -> Series: - ''' - Get a slice of this Series. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.slice(1, 2) - shape: (2,) - Series: \'a\' [i64] - [ - 2 - 3 - ] - - ''' - def append(self, other: Series) -> Self: - ''' - Append a Series to this one. - - Parameters - ---------- - other - Series to append. - append_chunks - .. deprecated:: 0.18.8 - This argument will be removed and `append` will change to always - behave like `append_chunks=True` (the previous default). For the - behavior of `append_chunks=False`, use `Series.extend`. - - If set to `True` the append operation will add the chunks from `other` to - self. This is super cheap. - - If set to `False` the append operation will do the same as - `DataFrame.extend` which extends the memory backed by this `Series` with - the values from `other`. - - Different from `append chunks`, `extend` appends the data from `other` to - the underlying memory locations and thus may cause a reallocation (which are - expensive). - - If this does not cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `append_chunks` when you want to do a query after a - single append. For instance during online operations where you add `n` rows - and rerun a query. - - Prefer `append_chunks` over `extend` when you want to append many times - before doing a query. For instance when you read in multiple files and when - to store them in a single `Series`. 
In the latter case, finish the sequence - of `append_chunks` operations with a `rechunk`. - - Warnings - -------- - This method modifies the series in-place. The series is returned for - convenience only. - - See Also - -------- - extend - - Examples - -------- - >>> a = pl.Series("a", [1, 2, 3]) - >>> b = pl.Series("b", [4, 5]) - >>> a.append(b) - shape: (5,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ] - - The resulting series will consist of multiple chunks. - - >>> a.n_chunks() - 2 - - ''' - def extend(self, other: Series) -> Self: - ''' - Extend the memory backed by this Series with the values from another. - - Different from `append`, which adds the chunks from `other` to the chunks of - this series, `extend` appends the data from `other` to the underlying memory - locations and thus may cause a reallocation (which is expensive). - - If this does `not` cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `append` when you want to do a query after a single - append. For instance, during online operations where you add `n` rows - and rerun a query. - - Prefer `append` over `extend` when you want to append many times - before doing a query. For instance, when you read in multiple files and want - to store them in a single `Series`. In the latter case, finish the sequence - of `append` operations with a `rechunk`. - - Parameters - ---------- - other - Series to extend the series with. - - Warnings - -------- - This method modifies the series in-place. The series is returned for - convenience only. - - See Also - -------- - append - - Examples - -------- - >>> a = pl.Series("a", [1, 2, 3]) - >>> b = pl.Series("b", [4, 5]) - >>> a.extend(b) - shape: (5,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ] - - The resulting series will consist of a single chunk. - - >>> a.n_chunks() - 1 - - ''' - def filter(self, predicate: Series | list[bool]) -> Self: - ''' - Filter elements by a boolean mask. - - The original order of the remaining elements is preserved. - - Parameters - ---------- - predicate - Boolean mask. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> mask = pl.Series("", [True, False, True]) - >>> s.filter(mask) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 3 - ] - - ''' - def head(self, n: int = ...) -> Series: - ''' - Get the first `n` elements. - - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the last `abs(n)`. - - See Also - -------- - tail, slice - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.head(3) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - Pass a negative value to get all rows `except` the last `abs(n)`. - - >>> s.head(-3) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 2 - ] - - ''' - def tail(self, n: int = ...) -> Series: - ''' - Get the last `n` elements. - - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the first `abs(n)`. - - See Also - -------- - head, slice - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.tail(3) - shape: (3,) - Series: \'a\' [i64] - [ - 3 - 4 - 5 - ] - - Pass a negative value to get all rows `except` the first `abs(n)`. - - >>> s.tail(-3) - shape: (2,) - Series: \'a\' [i64] - [ - 4 - 5 - ] - - ''' - def limit(self, n: int = ...) -> Series: - """ - Get the first `n` elements. - - Alias for :func:`Series.head`. 
- - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the last `abs(n)`. - - See Also - -------- - head - - """ - def gather_every(self, n: int) -> Series: - ''' - Take every nth value in the Series and return as new Series. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.gather_every(2) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 3 - ] - - ''' - def sort(self) -> Self: - ''' - Sort this Series. - - Parameters - ---------- - descending - Sort in descending order. - in_place - Sort in-place. - - Examples - -------- - >>> s = pl.Series("a", [1, 3, 4, 2]) - >>> s.sort() - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - ] - >>> s.sort(descending=True) - shape: (4,) - Series: \'a\' [i64] - [ - 4 - 3 - 2 - 1 - ] - - ''' - def top_k(self, k: int | IntoExprColumn = ...) -> Series: - ''' - Return the `k` largest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - bottom_k - - Examples - -------- - >>> s = pl.Series("a", [2, 5, 1, 4, 3]) - >>> s.top_k(3) - shape: (3,) - Series: \'a\' [i64] - [ - 5 - 4 - 3 - ] - - ''' - def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: - ''' - Return the `k` smallest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - top_k - - Examples - -------- - >>> s = pl.Series("a", [2, 5, 1, 4, 3]) - >>> s.bottom_k(3) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def arg_sort(self) -> Series: - ''' - Get the index values that would sort this Series. - - Parameters - ---------- - descending - Sort in descending order. - nulls_last - Place null values last instead of first. - - Examples - -------- - >>> s = pl.Series("a", [5, 3, 4, 1, 2]) - >>> s.arg_sort() - shape: (5,) - Series: \'a\' [u32] - [ - 3 - 4 - 1 - 2 - 0 - ] - - ''' - def arg_unique(self) -> Series: - ''' - Get unique index as Series. - - Returns - ------- - Series - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.arg_unique() - shape: (3,) - Series: \'a\' [u32] - [ - 0 - 1 - 3 - ] - - ''' - def arg_min(self) -> int | None: - ''' - Get the index of the minimal value. - - Returns - ------- - int - - Examples - -------- - >>> s = pl.Series("a", [3, 2, 1]) - >>> s.arg_min() - 2 - - ''' - def arg_max(self) -> int | None: - ''' - Get the index of the maximal value. - - Returns - ------- - int - - Examples - -------- - >>> s = pl.Series("a", [3, 2, 1]) - >>> s.arg_max() - 0 - - ''' - def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: - """ - Find indices where elements should be inserted to maintain order. - - .. math:: a[i-1] < v <= a[i] - - Parameters - ---------- - element - Expression or scalar value. - side : {'any', 'left', 'right'} - If 'any', the index of the first suitable location found is given. - If 'left', the index of the leftmost suitable location found is given. - If 'right', return the rightmost suitable location found is given. - - """ - def unique(self) -> Series: - ''' - Get unique elements in series. - - Parameters - ---------- - maintain_order - Maintain order of data. This requires more work. 
- - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.unique().sort() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: - ''' - Take values by index. - - Parameters - ---------- - indices - Index location used for selection. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.gather([1, 3]) - shape: (2,) - Series: \'a\' [i64] - [ - 2 - 4 - ] - - ''' - def null_count(self) -> int: - """Count the null values in this Series.""" - def has_validity(self) -> bool: - """ - Return True if the Series has a validity bitmask. - - If there is no mask, it means that there are no `null` values. - - Notes - ----- - While the *absence* of a validity bitmask guarantees that a Series does not - have `null` values, the converse is not true, eg: the *presence* of a - bitmask does not mean that there are null values, as every value of the - bitmask could be `false`. - - To confirm that a column has `null` values use :func:`null_count`. - - """ - def is_empty(self) -> bool: - ''' - Check if the Series is empty. - - Examples - -------- - >>> s = pl.Series("a", [], dtype=pl.Float32) - >>> s.is_empty() - True - - ''' - def is_sorted(self) -> bool: - """ - Check if the Series is sorted. - - Parameters - ---------- - descending - Check if the Series is sorted in descending order - - """ - def not_(self) -> Series: - ''' - Negate a boolean Series. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [True, False, False]) - >>> s.not_() - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - ''' - def is_null(self) -> Series: - ''' - Returns a boolean Series indicating which values are null. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) - >>> s.is_null() - shape: (4,) - Series: \'a\' [bool] - [ - false - false - false - true - ] - - ''' - def is_not_null(self) -> Series: - ''' - Returns a boolean Series indicating which values are not null. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) - >>> s.is_not_null() - shape: (4,) - Series: \'a\' [bool] - [ - true - true - true - false - ] - - ''' - def is_finite(self) -> Series: - ''' - Returns a boolean Series indicating which values are finite. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, np.inf]) - >>> s.is_finite() - shape: (3,) - Series: \'a\' [bool] - [ - true - true - false - ] - - ''' - def is_infinite(self) -> Series: - ''' - Returns a boolean Series indicating which values are infinite. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, np.inf]) - >>> s.is_infinite() - shape: (3,) - Series: \'a\' [bool] - [ - false - false - true - ] - - ''' - def is_nan(self) -> Series: - ''' - Returns a boolean Series indicating which values are not NaN. - - Returns - ------- - Series - Series of data type :class:`Boolean`. 
- - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) - >>> s.is_nan() - shape: (4,) - Series: \'a\' [bool] - [ - false - false - false - true - ] - - ''' - def is_not_nan(self) -> Series: - ''' - Returns a boolean Series indicating which values are not NaN. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) - >>> s.is_not_nan() - shape: (4,) - Series: \'a\' [bool] - [ - true - true - true - false - ] - - ''' - def is_in(self, other: Series | Collection[Any]) -> Series: - ''' - Check if elements of this Series are in the other Series. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [2, 4]) - >>> s2.is_in(s) - shape: (2,) - Series: \'b\' [bool] - [ - true - false - ] - - >>> # check if some values are a member of sublists - >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) - >>> optional_members = pl.Series("optional_members", [1, 2, 3]) - >>> print(sets) - shape: (3,) - Series: \'sets\' [list[i64]] - [ - [1, 2, 3] - [1, 2] - [9, 10] - ] - >>> print(optional_members) - shape: (3,) - Series: \'optional_members\' [i64] - [ - 1 - 2 - 3 - ] - >>> optional_members.is_in(sets) - shape: (3,) - Series: \'optional_members\' [bool] - [ - true - true - false - ] - - ''' - def arg_true(self) -> Series: - ''' - Get index values where Boolean Series evaluate True. - - Returns - ------- - Series - Series of data type :class:`UInt32`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> (s == 2).arg_true() - shape: (1,) - Series: \'a\' [u32] - [ - 1 - ] - - ''' - def is_unique(self) -> Series: - ''' - Get mask of all unique values. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.is_unique() - shape: (4,) - Series: \'a\' [bool] - [ - true - false - false - true - ] - - ''' - def is_first_distinct(self) -> Series: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series([1, 1, 2, 3, 2]) - >>> s.is_first_distinct() - shape: (5,) - Series: '' [bool] - [ - true - false - true - true - false - ] - - """ - def is_last_distinct(self) -> Series: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series([1, 1, 2, 3, 2]) - >>> s.is_last_distinct() - shape: (5,) - Series: '' [bool] - [ - false - true - false - true - true - ] - - """ - def is_duplicated(self) -> Series: - ''' - Get mask of all duplicated values. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.is_duplicated() - shape: (4,) - Series: \'a\' [bool] - [ - false - true - true - false - ] - - ''' - def explode(self) -> Series: - """ - Explode a list Series. - - This means that every item is expanded to a new row. - - Returns - ------- - Series - Series with the data type of the list elements. - - See Also - -------- - Series.list.explode : Explode a list column. - Series.str.explode : Explode a string column. 
- - """ - def equals(self, other: Series) -> bool: - ''' - Check whether the Series is equal to another Series. - - Parameters - ---------- - other - Series to compare with. - null_equal - Consider null values as equal. - strict - Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a - `pl.Int64` will return `False`. - - See Also - -------- - assert_series_equal - - Examples - -------- - >>> s1 = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [4, 5, 6]) - >>> s1.equals(s1) - True - >>> s1.equals(s2) - False - ''' - def len(self) -> int: - ''' - Return the number of elements in this Series. - - Null values are treated like regular elements in this context. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None]) - >>> s.len() - 3 - - ''' - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: - ''' - Cast between data types. - - Parameters - ---------- - dtype - DataType to cast to. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> s = pl.Series("a", [True, False, True]) - >>> s - shape: (3,) - Series: \'a\' [bool] - [ - true - false - true - ] - - >>> s.cast(pl.UInt32) - shape: (3,) - Series: \'a\' [u32] - [ - 1 - 0 - 1 - ] - - ''' - def to_physical(self) -> Series: - ''' - Cast to physical representation of the logical dtype. - - - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` - - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` - - `List(inner)` -> `List(physical of inner)` - - Other data types will be left unchanged. - - Examples - -------- - Replicating the pandas - `pd.Series.factorize - `_ - method. - - >>> s = pl.Series("values", ["a", None, "x", "a"]) - >>> s.cast(pl.Categorical).to_physical() - shape: (4,) - Series: \'values\' [u32] - [ - 0 - null - 1 - 0 - ] - - ''' - def to_list(self) -> list[Any]: - ''' - Convert this Series to a Python List. This operation clones data. - - Parameters - ---------- - use_pyarrow - Use pyarrow for the conversion. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.to_list() - [1, 2, 3] - >>> type(s.to_list()) - - - ''' - def rechunk(self) -> Self: - """ - Create a single chunk of memory for this Series. - - Parameters - ---------- - in_place - In place or not. - - """ - def reverse(self) -> Series: - ''' - Return Series in reverse order. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) - >>> s.reverse() - shape: (3,) - Series: \'a\' [i8] - [ - 3 - 2 - 1 - ] - - ''' - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: - ''' - Get a boolean mask of the values that fall between the given start/end values. - - Parameters - ---------- - lower_bound - Lower bound value. Accepts expression input. Non-expression inputs - (including strings) are parsed as literals. - upper_bound - Upper bound value. Accepts expression input. Non-expression inputs - (including strings) are parsed as literals. - closed : {\'both\', \'left\', \'right\', \'none\'} - Define which sides of the interval are closed (inclusive). 
- - Examples - -------- - >>> s = pl.Series("num", [1, 2, 3, 4, 5]) - >>> s.is_between(2, 4) - shape: (5,) - Series: \'num\' [bool] - [ - false - true - true - true - false - ] - - Use the `closed` argument to include or exclude the values at the bounds: - - >>> s.is_between(2, 4, closed="left") - shape: (5,) - Series: \'num\' [bool] - [ - false - true - true - false - false - ] - - You can also use strings as well as numeric/temporal values: - - >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) - >>> s.is_between("b", "d", closed="both") - shape: (5,) - Series: \'s\' [bool] - [ - false - true - true - true - false - ] - - ''' - def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: - ''' - Convert this Series to numpy. - - This operation may clone data but is completely safe. Note that: - - - data which is purely numeric AND without null values is not cloned; - - floating point `nan` values can be zero-copied; - - booleans can\'t be zero-copied. - - To ensure that no data is cloned, set `zero_copy_only=True`. - - Parameters - ---------- - *args - args will be sent to pyarrow.Array.to_numpy. - zero_copy_only - If True, an exception will be raised if the conversion to a numpy - array would require copying the underlying data (e.g. in presence - of nulls, or for non-primitive types). - writable - For numpy arrays created with zero copy (view on the Arrow data), - the resulting array is not writable (Arrow data is immutable). - By setting this to True, a copy of the array is made to ensure - it is writable. - use_pyarrow - Use `pyarrow.Array.to_numpy - `_ - - for the conversion to numpy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> arr = s.to_numpy() - >>> arr # doctest: +IGNORE_RESULT - array([1, 2, 3], dtype=int64) - >>> type(arr) - - - ''' - def _view(self) -> SeriesView: - ''' - Get a view into this Series data with a numpy array. - - This operation doesn\'t clone data, but does not include missing values. - - Returns - ------- - SeriesView - - Parameters - ---------- - ignore_nulls - If True then nulls are converted to 0. - If False then an Exception is raised if nulls are present. - - Examples - -------- - >>> s = pl.Series("a", [1, None]) - >>> s._view(ignore_nulls=True) - SeriesView([1, 0]) - - ''' - def to_arrow(self) -> pa.Array: - ''' - Get the underlying Arrow Array. - - If the Series contains only a single chunk this operation is zero copy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s = s.to_arrow() - >>> s # doctest: +ELLIPSIS - - [ - 1, - 2, - 3 - ] - - ''' - def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: - ''' - Convert this Series to a pandas Series. - - This requires that :mod:`pandas` and :mod:`pyarrow` are installed. - This operation clones data, unless `use_pyarrow_extension_array=True`. - - Parameters - ---------- - use_pyarrow_extension_array - Further operations on this Pandas series, might trigger conversion to numpy. - Use PyArrow backed-extension array instead of numpy array for pandas - Series. This allows zero copy operations and preservation of nulls - values. - Further operations on this pandas Series, might trigger conversion - to NumPy arrays if that operation is not supported by pyarrow compute - functions. - kwargs - Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
- - Examples - -------- - >>> s1 = pl.Series("a", [1, 2, 3]) - >>> s1.to_pandas() - 0 1 - 1 2 - 2 3 - Name: a, dtype: int64 - >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP - 0 1 - 1 2 - 2 3 - Name: a, dtype: int64[pyarrow] - >>> s2 = pl.Series("b", [1, 2, None, 4]) - >>> s2.to_pandas() - 0 1.0 - 1 2.0 - 2 NaN - 3 4.0 - Name: b, dtype: float64 - >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP - 0 1 - 1 2 - 2 - 3 4 - Name: b, dtype: int64[pyarrow] - - ''' - def to_init_repr(self, n: int = ...) -> str: - ''' - Convert Series to instantiatable string representation. - - Parameters - ---------- - n - Only use first n elements. - - See Also - -------- - polars.Series.to_init_repr - polars.from_repr - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) - >>> print(s.to_init_repr()) - pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) - >>> s_from_str_repr = eval(s.to_init_repr()) - >>> s_from_str_repr - shape: (4,) - Series: \'a\' [i16] - [ - 1 - 2 - null - 4 - ] - - ''' - def set(self, filter: Series, value: int | float | str | bool | None) -> Series: - ''' - Set masked values. - - Parameters - ---------- - filter - Boolean mask. - value - Value with which to replace the masked values. - - Notes - ----- - Use of this function is frequently an anti-pattern, as it can - block optimisation (predicate pushdown, etc). Consider using - `pl.when(predicate).then(value).otherwise(self)` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.set(s == 2, 10) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 10 - 3 - ] - - It is better to implement this as follows: - - >>> s.to_frame().select( - ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) - ... ) - shape: (3, 1) - ┌─────────┐ - │ literal │ - │ --- │ - │ i64 │ - ╞═════════╡ - │ 1 │ - │ 10 │ - │ 3 │ - └─────────┘ - - ''' - def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: - ''' - Set values at the index locations. - - Parameters - ---------- - indices - Integers representing the index locations. - values - Replacement values. - - Notes - ----- - Use of this function is frequently an anti-pattern, as it can - block optimization (predicate pushdown, etc). Consider using - `pl.when(predicate).then(value).otherwise(self)` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.scatter(1, 10) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 10 - 3 - ] - - It is better to implement this as follows: - - >>> s.to_frame().with_row_count("row_nr").select( - ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) - ... ) - shape: (3, 1) - ┌─────────┐ - │ literal │ - │ --- │ - │ i64 │ - ╞═════════╡ - │ 1 │ - │ 10 │ - │ 3 │ - └─────────┘ - - ''' - def clear(self, n: int = ...) -> Series: - ''' - Create an empty copy of the current Series, with zero to \'n\' elements. - - The copy has an identical name/dtype, but no data. - - Parameters - ---------- - n - Number of (empty) elements to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. 
- - Examples - -------- - >>> s = pl.Series("a", [None, True, False]) - >>> s.clear() - shape: (0,) - Series: \'a\' [bool] - [ - ] - - >>> s.clear(n=2) - shape: (2,) - Series: \'a\' [bool] - [ - null - null - ] - - ''' - def clone(self) -> Self: - ''' - Create a copy of this Series. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current Series, with identical - schema but no data. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.clone() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Series: - ''' - Fill floating point NaN value with a fill value. - - Parameters - ---------- - value - Value used to fill NaN values. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, float("nan")]) - >>> s.fill_nan(0) - shape: (4,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - 0.0 - ] - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, None]) - >>> s.fill_null(strategy="forward") - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 3 - ] - >>> s.fill_null(strategy="min") - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 1 - ] - >>> s = pl.Series("b", ["x", None, "z"]) - >>> s.fill_null(pl.lit("")) - shape: (3,) - Series: \'b\' [str] - [ - "x" - "" - "z" - ] - - ''' - def floor(self) -> Series: - ''' - Rounds down to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.floor() - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - ] - - ''' - def ceil(self) -> Series: - ''' - Rounds up to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.ceil() - shape: (3,) - Series: \'a\' [f64] - [ - 2.0 - 3.0 - 4.0 - ] - - ''' - def round(self, decimals: int = ...) -> Series: - ''' - Round underlying floating point data by `decimals` digits. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.round(2) - shape: (3,) - Series: \'a\' [f64] - [ - 1.12 - 2.57 - 3.9 - ] - - Parameters - ---------- - decimals - number of decimals to round by. - - ''' - def round_sig_figs(self, digits: int) -> Series: - """ - Round to a number of significant figures. - - Parameters - ---------- - digits - Number of significant figures to round to. - - Examples - -------- - >>> s = pl.Series([0.01234, 3.333, 1234.0]) - >>> s.round_sig_figs(2) - shape: (3,) - Series: '' [f64] - [ - 0.012 - 3.3 - 1200.0 - ] - - """ - def dot(self, other: Series | ArrayLike) -> float | None: - ''' - Compute the dot/inner product between two Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) - >>> s.dot(s2) - 32.0 - - Parameters - ---------- - other - Series (or array) to compute dot product with. - - ''' - def mode(self) -> Series: - ''' - Compute the most occurring value(s). 
- - Can return multiple Values. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.mode() - shape: (1,) - Series: \'a\' [i64] - [ - 2 - ] - - ''' - def sign(self) -> Series: - ''' - Compute the element-wise indication of the sign. - - The returned values can be -1, 0, or 1: - - * -1 if x < 0. - * 0 if x == 0. - * 1 if x > 0. - - (null values are preserved as-is). - - Examples - -------- - >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) - >>> s.sign() - shape: (5,) - Series: \'a\' [i64] - [ - -1 - 0 - 0 - 1 - null - ] - - ''' - def sin(self) -> Series: - ''' - Compute the element-wise value for the sine. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.sin() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.0 - 1.2246e-16 - ] - - ''' - def cos(self) -> Series: - ''' - Compute the element-wise value for the cosine. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.cos() - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 6.1232e-17 - -1.0 - ] - - ''' - def tan(self) -> Series: - ''' - Compute the element-wise value for the tangent. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.tan() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.6331e16 - -1.2246e-16 - ] - - ''' - def cot(self) -> Series: - ''' - Compute the element-wise value for the cotangent. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.cot() - shape: (3,) - Series: \'a\' [f64] - [ - inf - 6.1232e-17 - -8.1656e15 - ] - - ''' - def arcsin(self) -> Series: - ''' - Compute the element-wise value for the inverse sine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arcsin() - shape: (3,) - Series: \'a\' [f64] - [ - 1.570796 - 0.0 - -1.570796 - ] - - ''' - def arccos(self) -> Series: - ''' - Compute the element-wise value for the inverse cosine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arccos() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.570796 - 3.141593 - ] - - ''' - def arctan(self) -> Series: - ''' - Compute the element-wise value for the inverse tangent. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arctan() - shape: (3,) - Series: \'a\' [f64] - [ - 0.785398 - 0.0 - -0.785398 - ] - - ''' - def arcsinh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic sine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arcsinh() - shape: (3,) - Series: \'a\' [f64] - [ - 0.881374 - 0.0 - -0.881374 - ] - - ''' - def arccosh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic cosine. - - Examples - -------- - >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) - >>> s.arccosh() - shape: (4,) - Series: \'a\' [f64] - [ - 2.292432 - 0.0 - NaN - NaN - ] - - ''' - def arctanh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic tangent. - - Examples - -------- - >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) - >>> s.arctanh() - shape: (7,) - Series: \'a\' [f64] - [ - NaN - inf - 0.549306 - 0.0 - -0.549306 - -inf - NaN - ] - - ''' - def sinh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic sine. 
- - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.sinh() - shape: (3,) - Series: \'a\' [f64] - [ - 1.175201 - 0.0 - -1.175201 - ] - - ''' - def cosh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic cosine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.cosh() - shape: (3,) - Series: \'a\' [f64] - [ - 1.543081 - 1.0 - 1.543081 - ] - - ''' - def tanh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic tangent. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.tanh() - shape: (3,) - Series: \'a\' [f64] - [ - 0.761594 - 0.0 - -0.761594 - ] - - ''' - def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Map a custom/user-defined function (UDF) over elements in this Series. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - If the function returns a different datatype, the return_dtype arg should - be set, otherwise the method will fail. - - Implementing logic using a Python function is almost always *significantly* - slower and more memory intensive than implementing the same logic using - the native expression API because: - - - The native expression engine runs in Rust; UDFs run in Python. - - Use of Python UDFs forces the DataFrame to be materialized in memory. - - Polars-native expressions can be parallelised (UDFs typically cannot). - - Polars-native expressions can be logically optimised (UDFs cannot). - - Wherever possible you should strongly prefer the native expression API - to achieve the best performance. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output datatype. If none is given, the same datatype as this Series will be - used. - skip_nulls - Nulls will be skipped and not passed to the python function. - This is faster because python can be skipped and because we call - more specialized functions. - - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - Notes - ----- - If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP - shape: (3,) - Series: \'a\' [i64] - [ - 11 - 12 - 13 - ] - - Returns - ------- - Series - - ''' - def shift(self, n: int = ...) -> Series: - """ - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> s = pl.Series([1, 2, 3, 4]) - >>> s.shift() - shape: (4,) - Series: '' [i64] - [ - null - 1 - 2 - 3 - ] - - Pass a negative value to shift in the opposite direction instead. 
- - >>> s.shift(-2) - shape: (4,) - Series: '' [i64] - [ - 3 - 4 - null - null - ] - - Specify `fill_value` to fill the resulting null values. - - >>> s.shift(-2, fill_value=100) - shape: (4,) - Series: '' [i64] - [ - 3 - 4 - 100 - 100 - ] - - """ - def zip_with(self, mask: Series, other: Series) -> Self: - """ - Take values from self or other based on the given mask. - - Where mask evaluates true, take values from self. Where mask evaluates false, - take values from other. - - Parameters - ---------- - mask - Boolean Series. - other - Series of same type. - - Returns - ------- - Series - - Examples - -------- - >>> s1 = pl.Series([1, 2, 3, 4, 5]) - >>> s2 = pl.Series([5, 4, 3, 2, 1]) - >>> s1.zip_with(s1 < s2, s2) - shape: (5,) - Series: '' [i64] - [ - 1 - 2 - 3 - 2 - 1 - ] - >>> mask = pl.Series([True, False, True, False, True]) - >>> s1.zip_with(mask, s2) - shape: (5,) - Series: '' [i64] - [ - 1 - 4 - 3 - 2 - 5 - ] - - """ - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling min (moving min) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their min. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_min(window_size=3) - shape: (5,) - Series: \'a\' [i64] - [ - null - null - 100 - 200 - 300 - ] - - ''' - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling max (moving max) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their max. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_max(window_size=2) - shape: (5,) - Series: \'a\' [i64] - [ - null - 200 - 300 - 400 - 500 - ] - - ''' - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling mean (moving mean) over the values in this array. - - A window of length `window_size` will traverse the array. 
The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their mean. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_mean(window_size=2) - shape: (5,) - Series: \'a\' [f64] - [ - null - 150.0 - 250.0 - 350.0 - 450.0 - ] - - ''' - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling sum (moving sum) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their sum. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length of the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.rolling_sum(window_size=2) - shape: (5,) - Series: \'a\' [i64] - [ - null - 3 - 5 - 7 - 9 - ] - - ''' - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling std dev. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their std dev. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_std(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 1.0 - 1.527525 - 2.0 - ] - - ''' - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling variance. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. 
The resulting values will be aggregated to their variance. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_var(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 1.0 - 2.333333 - 4.0 - ] - - ''' - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a custom rolling window function. - - .. warning:: - Computing custom functions is extremely slow. Use specialized rolling - functions such as :func:`Series.rolling_sum` if at all possible. - - Parameters - ---------- - function - Custom aggregation function. - window_size - Size of the window. The window at a given row will include the row - itself and the `window_size - 1` elements before it. - weights - A list of weights with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window. - - Warnings - -------- - - - Examples - -------- - >>> from numpy import nansum - >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) - >>> s.rolling_map(nansum, window_size=3) - shape: (5,) - Series: \'\' [f64] - [ - null - null - 22.0 - 11.0 - 17.0 - ] - - ''' - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling median. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_median(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 2.0 - 3.0 - 4.0 - 6.0 - ] - - ''' - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling quantile. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - window_size - The length of the window. 
- weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_quantile(quantile=0.33, window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 2.0 - 3.0 - 4.0 - ] - >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.66 - 2.66 - 3.66 - 5.32 - ] - - ''' - def rolling_skew(self, window_size: int) -> Series: - """ - Compute a rolling skew. - - The window at a given row includes the row itself and the - `window_size - 1` elements before it. - - Parameters - ---------- - window_size - Integer size of the rolling window. - bias - If False, the calculations are corrected for statistical bias. - - Examples - -------- - >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) - shape: (4,) - Series: '' [f64] - [ - null - null - 0.381802 - 0.47033 - ] - - Note how the values match - - >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() - (0.38180177416060584, 0.47033046033698594) - - """ - def sample(self, n: int | None = ...) -> Series: - ''' - Sample from this Series. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - Shuffle the order of sampled data points. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 5 - ] - - ''' - def peak_max(self) -> Self: - ''' - Get a boolean mask of the local maximum peaks. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.peak_max() - shape: (5,) - Series: \'a\' [bool] - [ - false - false - false - false - true - ] - - ''' - def peak_min(self) -> Self: - ''' - Get a boolean mask of the local minimum peaks. - - Examples - -------- - >>> s = pl.Series("a", [4, 1, 3, 2, 5]) - >>> s.peak_min() - shape: (5,) - Series: \'a\' [bool] - [ - false - true - false - true - false - ] - - ''' - def n_unique(self) -> int: - ''' - Count the number of unique values in this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.n_unique() - 3 - - ''' - def shrink_to_fit(self) -> Series: - """ - Shrink Series memory usage. - - Shrinks the underlying array capacity to exactly fit the actual data. - (Note that this function does not change the Series data type). - - """ - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: - ''' - Hash the Series. - - The hash value is of type `UInt64`. - - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. 
- - Notes - ----- - This implementation of :func:`hash` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.hash(seed=42) # doctest: +IGNORE_RESULT - shape: (3,) - Series: \'a\' [u64] - [ - 10734580197236529959 - 3022416320763508302 - 13756996518000038261 - ] - - ''' - def reinterpret(self) -> Series: - """ - Reinterpret the underlying bits as a signed/unsigned integer. - - This operation is only allowed for 64bit integers. For lower bits integers, - you can safely use that cast operation. - - Parameters - ---------- - signed - If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. - - """ - def interpolate(self, method: InterpolationMethod = ...) -> Series: - ''' - Fill null values using interpolation. - - Parameters - ---------- - method : {\'linear\', \'nearest\'} - Interpolation method. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None, None, 5]) - >>> s.interpolate() - shape: (5,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - 4.0 - 5.0 - ] - - ''' - def abs(self) -> Series: - """ - Compute absolute values. - - Same as `abs(series)`. - """ - def rank(self, method: RankMethod = ...) -> Series: - ''' - Assign ranks to data, dealing with ties appropriately. - - Parameters - ---------- - method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} - The method used to assign ranks to tied elements. - The following methods are available (default is \'average\'): - - - \'average\' : The average of the ranks that would have been assigned to - all the tied values is assigned to each value. - - \'min\' : The minimum of the ranks that would have been assigned to all - the tied values is assigned to each value. (This is also referred to - as "competition" ranking.) - - \'max\' : The maximum of the ranks that would have been assigned to all - the tied values is assigned to each value. - - \'dense\' : Like \'min\', but the rank of the next highest element is - assigned the rank immediately after those assigned to the tied - elements. - - \'ordinal\' : All values are given a distinct rank, corresponding to - the order that the values occur in the Series. - - \'random\' : Like \'ordinal\', but the rank for ties is not dependent - on the order that the values occur in the Series. - descending - Rank in descending order. - seed - If `method="random"`, use this as seed. - - Examples - -------- - The \'average\' method: - - >>> s = pl.Series("a", [3, 6, 1, 1, 6]) - >>> s.rank() - shape: (5,) - Series: \'a\' [f64] - [ - 3.0 - 4.5 - 1.5 - 1.5 - 4.5 - ] - - The \'ordinal\' method: - - >>> s = pl.Series("a", [3, 6, 1, 1, 6]) - >>> s.rank("ordinal") - shape: (5,) - Series: \'a\' [u32] - [ - 3 - 4 - 1 - 2 - 5 - ] - - ''' - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: - ''' - Calculate the first discrete difference between shifted items. - - Parameters - ---------- - n - Number of slots to shift. - null_behavior : {\'ignore\', \'drop\'} - How to handle null values. - - Examples - -------- - >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) - >>> s.diff() - shape: (5,) - Series: \'s\' [i8] - [ - null - -10 - 20 - -5 - 10 - ] - - >>> s.diff(n=2) - shape: (5,) - Series: \'s\' [i8] - [ - null - null - 10 - 15 - 5 - ] - - >>> s.diff(n=2, null_behavior="drop") - shape: (3,) - Series: \'s\' [i8] - [ - 10 - 15 - 5 - ] - - ''' - def pct_change(self, n: int | IntoExprColumn = ...) 
-> Series: - """ - Computes percentage change between values. - - Percentage change (as fraction) between current element and most-recent - non-null element at least `n` period(s) before the current element. - - Computes the change from the previous row by default. - - Parameters - ---------- - n - periods to shift for forming percent change. - - Examples - -------- - >>> pl.Series(range(10)).pct_change() - shape: (10,) - Series: '' [f64] - [ - null - inf - 1.0 - 0.5 - 0.333333 - 0.25 - 0.2 - 0.166667 - 0.142857 - 0.125 - ] - - >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) - shape: (10,) - Series: '' [f64] - [ - null - null - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - ] - - """ - def skew(self) -> float | None: - """ - Compute the sample skewness of a data set. - - For normally distributed data, the skewness should be about zero. For - unimodal continuous distributions, a skewness value greater than zero means - that there is more weight in the right tail of the distribution. The - function `skewtest` can be used to determine if the skewness value - is close enough to zero, statistically speaking. - - - See scipy.stats for more information. - - Parameters - ---------- - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Notes - ----- - The sample skewness is computed as the Fisher-Pearson coefficient - of skewness, i.e. - - .. math:: g_1=\\frac{m_3}{m_2^{3/2}} - - where - - .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i - - is the biased sample :math:`i\\texttt{th}` central moment, and - :math:`\\bar{x}` is - the sample mean. If `bias` is False, the calculations are - corrected for bias and the value computed is the adjusted - Fisher-Pearson standardized moment coefficient, i.e. - - .. math:: - G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} - - """ - def kurtosis(self) -> float | None: - """ - Compute the kurtosis (Fisher or Pearson) of a dataset. - - Kurtosis is the fourth central moment divided by the square of the - variance. If Fisher's definition is used, then 3.0 is subtracted from - the result to give 0.0 for a normal distribution. - If bias is False then the kurtosis is calculated using k statistics to - eliminate bias coming from biased moment estimators - - See scipy.stats for more information - - Parameters - ---------- - fisher : bool, optional - If True, Fisher's definition is used (normal ==> 0.0). If False, - Pearson's definition is used (normal ==> 3.0). - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - """ - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: - """ - Set values outside the given boundaries to the boundary value. - - Parameters - ---------- - lower_bound - Lower bound. Accepts expression input. - Non-expression inputs are parsed as literals. - If set to `None` (default), no lower bound is applied. - upper_bound - Upper bound. Accepts expression input. - Non-expression inputs are parsed as literals. - If set to `None` (default), no upper bound is applied. - - See Also - -------- - when - - Notes - ----- - This method only works for numeric and temporal columns. To clip other data - types, consider writing a `when-then-otherwise` expression. See :func:`when`. 
- - Examples - -------- - Specifying both a lower and upper bound: - - >>> s = pl.Series([-50, 5, 50, None]) - >>> s.clip(1, 10) - shape: (4,) - Series: '' [i64] - [ - 1 - 5 - 10 - null - ] - - Specifying only a single bound: - - >>> s.clip(upper_bound=10) - shape: (4,) - Series: '' [i64] - [ - -50 - 5 - 10 - null - ] - - """ - def lower_bound(self) -> Self: - ''' - Return the lower bound of this Series\' dtype as a unit Series. - - See Also - -------- - upper_bound : return the upper bound of the given Series\' dtype. - - Examples - -------- - >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) - >>> s.lower_bound() - shape: (1,) - Series: \'s\' [i32] - [ - -2147483648 - ] - - >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) - >>> s.lower_bound() - shape: (1,) - Series: \'s\' [f32] - [ - -inf - ] - - ''' - def upper_bound(self) -> Self: - ''' - Return the upper bound of this Series\' dtype as a unit Series. - - See Also - -------- - lower_bound : return the lower bound of the given Series\' dtype. - - Examples - -------- - >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) - >>> s.upper_bound() - shape: (1,) - Series: \'s\' [i8] - [ - 127 - ] - - >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) - >>> s.upper_bound() - shape: (1,) - Series: \'s\' [f64] - [ - inf - ] - - ''' - def replace(self, mapping: dict[Any, Any]) -> Self: - ''' - Replace values according to the given mapping. - - Needs a global string cache for lazily evaluated queries on columns of - type `Categorical`. - - Parameters - ---------- - mapping - Mapping of values to their replacement. - default - Value to use when the mapping does not contain the lookup value. - Defaults to keeping the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - - See Also - -------- - str.replace - - Examples - -------- - Replace a single value by another value. Values not in the mapping remain - unchanged. - - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.replace({2: 100}) - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 100 - 100 - 3 - ] - - Replace multiple values. Specify a default to set values not in the given map - to the default value. - - >>> s = pl.Series("country_code", ["FR", "ES", "DE", None]) - >>> country_code_map = { - ... "CA": "Canada", - ... "DE": "Germany", - ... "FR": "France", - ... None: "unspecified", - ... } - >>> s.replace(country_code_map, default=None) - shape: (4,) - Series: \'country_code\' [str] - [ - "France" - null - "Germany" - "unspecified" - ] - - The return type can be overridden with the `return_dtype` argument. - - >>> s = pl.Series("a", [0, 1, 2, 3]) - >>> s.replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) - shape: (4,) - Series: \'a\' [u8] - [ - 0 - 10 - 20 - 0 - ] - ''' - def reshape(self, dimensions: tuple[int, ...]) -> Series: - ''' - Reshape this Series to a flat Series or a Series of Lists. - - Parameters - ---------- - dimensions - Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that - dimension is inferred. - - Returns - ------- - Series - If a single dimension is given, results in a Series of the original - data type. - If a multiple dimensions are given, results in a Series of data type - :class:`List` with shape (rows, cols). - - See Also - -------- - Series.list.explode : Explode a list column. 
- - Examples - -------- - >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) - >>> s.reshape((3, 3)) - shape: (3,) - Series: \'foo\' [list[i64]] - [ - [1, 2, 3] - [4, 5, 6] - [7, 8, 9] - ] - - ''' - def shuffle(self, seed: int | None = ...) -> Series: - ''' - Shuffle the contents of this Series. - - Parameters - ---------- - seed - Seed for the random number generator. If set to None (default), a - random seed is generated each time the shuffle is called. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.shuffle(seed=1) - shape: (3,) - Series: \'a\' [i64] - [ - 2 - 1 - 3 - ] - - ''' - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - """ - Exponentially-weighted moving average. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series([1, 2, 3]) - >>> s.ewm_mean(com=1) - shape: (3,) - Series: '' [f64] - [ - 1.0 - 1.666667 - 2.428571 - ] - - """ - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - ''' - Exponentially-weighted moving standard deviation. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. 
math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.ewm_std(com=1) - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 0.707107 - 0.963624 - ] - - ''' - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - ''' - Exponentially-weighted moving variance. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. 
For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.ewm_var(com=1) - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 0.5 - 0.928571 - ] - - ''' - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: - """ - Extremely fast method for extending the Series with 'n' copies of a value. - - Parameters - ---------- - value - A constant literal value (not an expression) with which to extend - the Series; can pass None to extend with nulls. - n - The number of additional values that will be added. - - Examples - -------- - >>> s = pl.Series([1, 2, 3]) - >>> s.extend_constant(99, n=2) - shape: (5,) - Series: '' [i64] - [ - 1 - 2 - 3 - 99 - 99 - ] - - """ - def set_sorted(self) -> Self: - ''' - Flags the Series as \'sorted\'. - - Enables downstream code to user fast paths for sorted arrays. - - Parameters - ---------- - descending - If the `Series` order is descending. - - Warnings - -------- - This can lead to incorrect results if this `Series` is not sorted!! - Use with care! - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.set_sorted().max() - 3 - - ''' - def new_from_index(self, index: int, length: int) -> Self: - """Create a new Series filled with values from the given index.""" - def shrink_dtype(self) -> Series: - """ - Shrink numeric columns to the minimal required datatype. - - Shrink to the dtype needed to fit the extrema of this [`Series`]. - This can be used to reduce memory pressure. - """ - def get_chunks(self) -> list[Series]: - """Get the chunks of this Series as a list of Series.""" - def implode(self) -> Self: - """Aggregate values into a list.""" - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom/user-defined function (UDF) over elements in this Series. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Series.map_elements`. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output datatype. If none is given, the same datatype as this Series will be - used. - skip_nulls - Nulls will be skipped and not passed to the python function. - This is faster because python can be skipped and because we call - more specialized functions. - - """ - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - """ - Apply a custom rolling window function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Series.rolling_map`. - - Parameters - ---------- - function - Aggregation function - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - """ - def is_first(self) -> Series: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Series.is_first_distinct`. - - Returns - ------- - Series - Series of data type :class:`Boolean`. 
- - """ - def is_last(self) -> Series: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Series.is_last_distinct`. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - """ - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: - """ - Clip (limit) the values in an array to a `min` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - lower_bound - Lower bound. - - """ - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: - """ - Clip (limit) the values in an array to a `max` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - upper_bound - Upper bound. - - """ - def shift_and_fill(self, fill_value: int | Expr) -> Series: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - Fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def is_float(self) -> bool: - ''' - Check if this Series has floating point numbers. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_float()` instead. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0]) - >>> s.is_float() # doctest: +SKIP - True - - ''' - def is_integer(self, signed: bool | None = ...) -> bool: - ''' - Check if this Series datatype is an integer (signed or unsigned). - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_integer()` instead. - For signed/unsigned variants, use `Series.dtype.is_signed_integer()` - or `Series.dtype.is_unsigned_integer()`. - - Parameters - ---------- - signed - * if `None`, both signed and unsigned integer dtypes will match. - * if `True`, only signed integer dtypes will be considered a match. - * if `False`, only unsigned integer dtypes will be considered a match. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) - >>> s.is_integer() # doctest: +SKIP - True - >>> s.is_integer(signed=False) # doctest: +SKIP - True - >>> s.is_integer(signed=True) # doctest: +SKIP - False - - ''' - def is_numeric(self) -> bool: - ''' - Check if this Series datatype is numeric. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_float()` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.is_numeric() # doctest: +SKIP - True - - ''' - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: - """ - Check if this Series datatype is temporal. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_temporal()` instead. - - Parameters - ---------- - excluding - Optionally exclude one or more temporal dtypes from matching. - - Examples - -------- - >>> from datetime import date - >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) - >>> s.is_temporal() # doctest: +SKIP - True - >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP - False - - """ - def is_boolean(self) -> bool: - ''' - Check if this Series is a Boolean. - - .. deprecated:: 0.19.14 - Use `Series.dtype == pl.Boolean` instead. - - Examples - -------- - >>> s = pl.Series("a", [True, False, True]) - >>> s.is_boolean() # doctest: +SKIP - True - - ''' - def is_utf8(self) -> bool: - ''' - Check if this Series datatype is a Utf8. - - .. deprecated:: 0.19.14 - Use `Series.dtype == pl.Utf8` instead. 
- - Examples - -------- - >>> s = pl.Series("x", ["a", "b", "c"]) - >>> s.is_utf8() # doctest: +SKIP - True - - ''' - def take_every(self, n: int) -> Series: - """ - Take every nth value in the Series and return as new Series. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: - """ - Take values by index. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather`. - - Parameters - ---------- - indices - Index location used for selection. - """ - def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: - """ - Set values at the index locations. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`scatter`. - - Parameters - ---------- - indices - Integers representing the index locations. - values - Replacement values. - """ - def cumsum(self) -> Series: - """ - Get an array with the cumulative sum computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_sum`. - - Parameters - ---------- - reverse - reverse the operation. - - """ - def cummax(self) -> Series: - """ - Get an array with the cumulative max computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_max`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def cummin(self) -> Series: - """ - Get an array with the cumulative min computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_min`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def cumprod(self) -> Series: - """ - Get an array with the cumulative product computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_prod`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def view(self) -> SeriesView: - """ - Get a view into this Series data with a numpy array. - - .. deprecated:: 0.19.14 - This method will be removed in a future version. - - This operation doesn't clone data, but does not include missing values. - Don't use this unless you know what you are doing. - - Parameters - ---------- - ignore_nulls - If True then nulls are converted to 0. - If False then an Exception is raised if nulls are present. - - """ - def map_dict(self, mapping: dict[Any, Any]) -> Self: - """ - Replace values in the Series using a remapping dictionary. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`replace`. The default behavior - has changed to keep any values not present in the mapping unchanged. - Pass `default=None` to keep existing behavior. - - Parameters - ---------- - mapping - Dictionary containing the before/after values to map. - default - Value to use when the remapping dict does not contain the lookup value. - Use `pl.first()`, to keep the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - """ - def series_equal(self, other: Series) -> bool: - """ - Check whether the Series is equal to another Series. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`equals`. - - Parameters - ---------- - other - Series to compare with. 
-        null_equal
-            Consider null values as equal.
-        strict
-            Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a
-            `pl.Int64` will return `False`.
-        """
-    @property
-    def dtype(self): ...
-    @property
-    def flags(self): ...
-    @property
-    def inner_dtype(self): ...
-    @property
-    def name(self): ...
-    @property
-    def shape(self): ...
-    @property
-    def bin(self): ...
-    @property
-    def cat(self): ...
-    @property
-    def dt(self): ...
-    @property
-    def list(self): ...
-    @property
-    def arr(self): ...
-    @property
-    def str(self): ...
-    @property
-    def struct(self): ...
-def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None:
-    """Given polars/numpy temporal dtypes, resolve to an explicit unit."""
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/series/series.pyi
similarity index 99%
rename from polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/series/series
rename to polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/series/series.pyi
index 4a40006..33dda6d 100644
--- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.14/polars/series/series
+++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.19/polars/series/series.pyi
@@ -1,3 +1,4 @@
+#: version 0.19.19
 import np as np
 import pa as pa
 import pd as pd
@@ -4761,7 +4762,7 @@ class Series:
         Check if this Series datatype is numeric.

         .. deprecated:: 0.19.13
-            Use `Series.dtype.is_float()` instead.
+            Use `Series.dtype.is_numeric()` instead.

         Examples
         --------
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/dataframe/frame
deleted file mode 100644
index 4b53c30..0000000
--- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/dataframe/frame
+++ /dev/null
@@ -1,300 +0,0 @@
-from typing_extensions import ParamSpec, Generic
-import deltalake
-
-from datetime import timedelta
-from io import BytesIO, IOBase, TextIOWrapper
-from pathlib import Path
-from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series
-from polars.dataframe._html import NotebookFormatter as NotebookFormatter
-from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy
-from polars.datatypes import Boolean as Boolean, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype
-from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa
-from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError
-from polars.functions import col as col, lit as lit
-from polars.interchange.dataframe import PolarsDataFrame as PolarsDataFrame
-from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file
-from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, 
_xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnFormatDict as ColumnFormatDict, ColumnNameOrSelector as ColumnNameOrSelector, ColumnTotalsDefinition as ColumnTotalsDefinition, ColumnWidthsDefinition as ColumnWidthsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, 
data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., schema: None | SchemaDict = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ...) -> Self: ... 
- @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ..., ignore_errors: bool = ...) -> Self: ... - def _replace(self, column: str, new_column: Series) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def flags(self) -> dict[str, dict[str, bool]]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Series]: ... - def __reversed__(self) -> Iterator[Series]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... 
- def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: ColumnFormatDict | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., header_format: dict[str, Any] | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: ColumnWidthsDefinition | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ..., freeze_panes: str | tuple[int, int] | tuple[str, int, int] | tuple[int, int, int, int] | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... 
- @overload - def write_ipc_stream(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, Any] | None = ...) -> None: ... - def write_database(self, table_name: str, connection: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: str | Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, max_items_per_column: int = ..., max_colname_length: int = ..., return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, max_items_per_column: int = ..., max_colname_length: int = ..., return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... 
- def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... - def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, other: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: DataFrame) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> DataFrame: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... 
- @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def approx_n_unique(self) -> DataFrame: ... - def approx_unique(self) -> DataFrame: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... 
- @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def corr(self, **kwargs: Any) -> DataFrame: ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/dataframe/frame.pyi
new file mode 100644
index 0000000..ed9937a
--- /dev/null
+++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/dataframe/frame.pyi
@@ -0,0 +1,6586 @@
+#: version 0.19.2
+import P
+import deltalake
+import np as np
+import pa as pa
+import pd as pd
+from _io import BytesIO, TextIOWrapper
+
+from builtins import PyDataFrame
+from pathlib import Path
+from polars.dataframe._html import NotebookFormatter as NotebookFormatter
+from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy
+from polars.datatypes.classes import Boolean as Boolean, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8
+from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype
+from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat
+from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError
+from polars.functions.col import col as col
+from polars.functions.lit import lit as lit
+from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file
+from polars.io.excel._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name
+from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors
+from polars.slice import PolarsSlice as PolarsSlice
+from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf
+from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression
+from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s
+from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration
+from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter
+from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalise_filepath as normalise_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes
+from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, 
Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. 
+ schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. 
+ nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. 
+
+        Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see
+        https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method.
+        """
+    def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame:
+        '''
+        Convert to a DataFrame object implementing the DataFrame interchange protocol.
+
+        Parameters
+        ----------
+        nan_as_null
+            Overwrite null values in the data with ``NaN``.
+
+            .. warning::
+                This functionality has not been implemented and the parameter will be
+                removed in a future version.
+                Setting this to ``True`` will raise a ``NotImplementedError``.
+        allow_copy
+            Allow memory to be copied to perform the conversion. If set to ``False``,
+            causes conversions that are not zero-copy to fail.
+
+        Notes
+        -----
+        Details on the Python dataframe interchange protocol:
+        https://data-apis.org/dataframe-protocol/latest/index.html
+
+        Examples
+        --------
+        Convert a Polars dataframe to a generic dataframe object and access some
+        properties.
+
+        >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]})
+        >>> dfi = df.__dataframe__()
+        >>> dfi.num_rows()
+        2
+        >>> dfi.get_column(1).dtype
+        (<DtypeKind.FLOAT: 2>, 64, \'g\', \'=\')
+
+        '''
+    def __dataframe_consortium_standard__(self) -> Any:
+        """
+        Provide entry point to the Consortium DataFrame Standard API.
+
+        This is developed and maintained outside of polars.
+        Please report any issues to https://github.com/data-apis/dataframe-api-compat.
+        """
+    def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame:
+        """Compare a DataFrame with another object."""
+    def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame:
+        """Compare a DataFrame with another DataFrame."""
+    def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame:
+        """Compare a DataFrame with a non-DataFrame object."""
+    def _div(self, other: Any, floordiv: bool) -> DataFrame: ...
+    def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ...
+    def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ...
+    def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ...
+    def __bool__(self) -> NoReturn: ...
+    def __eq__(self, other: Any) -> DataFrame: ...
+    def __ne__(self, other: Any) -> DataFrame: ...
+    def __gt__(self, other: Any) -> DataFrame: ...
+    def __lt__(self, other: Any) -> DataFrame: ...
+    def __ge__(self, other: Any) -> DataFrame: ...
+    def __le__(self, other: Any) -> DataFrame: ...
+    def __mul__(self, other: DataFrame | Series | int | float) -> Self: ...
+    def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ...
+    def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ...
+    def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ...
+    def __sub__(self, other: DataFrame | Series | int | float) -> Self: ...
+    def __mod__(self, other: DataFrame | Series | int | float) -> Self: ...
+    def __contains__(self, key: str) -> bool: ...
+    def __iter__(self) -> Iterator[Series]: ...
+    def __reversed__(self) -> Iterator[Series]: ...
+    def _pos_idx(self, idx: int, dim: int) -> int: ...
+    def _take_with_series(self, s: Series) -> DataFrame: ...
+    def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series:
+        """Get item.
Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self, structured: bool = ...) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + ``structured`` is set to ``False`` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. 
, \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. 
This is slower, but more common.
+
+        See Also
+        --------
+        DataFrame.write_ndjson
+
+        Examples
+        --------
+        >>> df = pl.DataFrame(
+        ...     {
+        ...         "foo": [1, 2, 3],
+        ...         "bar": [6, 7, 8],
+        ...     }
+        ... )
+        >>> df.write_json()
+        \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\'
+        >>> df.write_json(row_oriented=True)
+        \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\'
+
+        '''
+    def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None:
+        '''
+        Serialize to newline delimited JSON representation.
+
+        Parameters
+        ----------
+        file
+            File path to which the result should be written. If set to ``None``
+            (default), the output is returned as a string instead.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame(
+        ...     {
+        ...         "foo": [1, 2, 3],
+        ...         "bar": [6, 7, 8],
+        ...     }
+        ... )
+        >>> df.write_ndjson()
+        \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\'
+
+        '''
+    def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None:
+        '''
+        Write to comma-separated values (CSV) file.
+
+        Parameters
+        ----------
+        file
+            File path to which the result should be written. If set to ``None``
+            (default), the output is returned as a string instead.
+        has_header
+            Whether to include header in the CSV output.
+        separator
+            Separate CSV fields with this symbol.
+        line_terminator
+            String used to end each row.
+        quote
+            Byte to use as quoting character.
+        batch_size
+            Number of rows that will be processed per thread.
+        datetime_format
+            A format string, with the specifiers defined by the
+            `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+            Rust crate. If no format specified, the default fractional-second
+            precision is inferred from the maximum timeunit found in the frame\'s
+            Datetime cols (if any).
+        date_format
+            A format string, with the specifiers defined by the
+            `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+            Rust crate.
+        time_format
+            A format string, with the specifiers defined by the
+            `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
+            Rust crate.
+        float_precision
+            Number of decimal places to write, applied to both ``Float32`` and
+            ``Float64`` datatypes.
+        null_value
+            A string representing null values (defaulting to the empty string).
+        quote_style : {\'necessary\', \'always\', \'non_numeric\'}
+            Determines the quoting strategy used.
+            - necessary (default): This puts quotes around fields only when necessary.
+              They are necessary when fields contain a quote,
+              delimiter or record terminator.
+              Quotes are also necessary when writing an empty record
+              (which is indistinguishable from a record with one empty field).
+              This is the default.
+            - always: This puts quotes around every field. Always.
+            - non_numeric: This puts quotes around all fields that are non-numeric.
+              Namely, when writing a field that does not parse as a valid float
+              or integer, then quotes will be used even if they aren`t strictly
+              necessary.
+
+
+        Examples
+        --------
+        >>> import pathlib
+        >>>
+        >>> df = pl.DataFrame(
+        ...     {
+        ...         "foo": [1, 2, 3, 4, 5],
+        ...         "bar": [6, 7, 8, 9, 10],
+        ...         "ham": ["a", "b", "c", "d", "e"],
+        ...     }
+        ... )
+        >>> path: pathlib.Path = dirpath / "new_file.csv"
+        >>> df.write_csv(path, separator=",")
+
+        '''
+    def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None:
+        '''
+        Write to Apache Avro file.
+
+        Parameters
+        ----------
+        file
+            File path to which the file should be written.
+        compression : {\'uncompressed\', \'snappy\', \'deflate\'}
+            Compression method.
Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname(s):str,}`` or ``{selector:str,}`` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in ``dtype_formats``. + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + header_format : dict + A ``{key:value,}`` dictionary of ``xlsxwriter`` format options to apply + to the table header row, such as ``{"bold":True, "font_color":"#702963"}``. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. 
+ + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` or ``{selector:int,}`` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. 
+ freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible ``xlsxwriter`` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... 
) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... 
"style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC record batch data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + If you pass ``partition_cols`` here, the dataset will be written + using ``pyarrow.parquet.write_to_dataset``. + The ``partition_cols`` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. 
+ + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, *args, **kwargs) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. 
+ + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. 
+ + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. 
+ null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, *args, **kwargs) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. 
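+ 
+         For example, ``df.limit(3)`` returns the same frame as ``df.head(3)``.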
+ + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + Returns a new DataFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... 
+ >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. 
+ + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``group_by_dynamic`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. 
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.group_by_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. 
+ This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg( + ... [ + ... 
pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... ) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. 
+ every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
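+ 
+     * A minimal, illustrative sketch of the ``@lru_cache`` tip above (the
+       function name is hypothetical and the row values are assumed to be
+       hashable):
+ 
+       .. code-block:: python
+ 
+           from functools import lru_cache
+ 
+           import polars as pl
+ 
+           @lru_cache(maxsize=None)
+           def expensive_row_udf(row: tuple) -> int:
+               # the costly work runs only once per distinct row tuple
+               return row[0] * 2 + row[1]
+ 
+           df = pl.DataFrame({"foo": [1, 1, 2], "bar": [3, 3, 4]})
+           df.map_rows(expensive_row_udf)  # the repeated row is served from the cache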
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, *args, **kwargs) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from ``vstack`` which adds the chunks from ``other`` to the chunks of + this ``DataFrame``, ``extend`` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``vstack`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer ``vstack`` over ``extend`` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single ``DataFrame``. In the latter case, finish the sequence of + ``vstack`` operations with a ``rechunk``. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
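+ 
+         .. note::
+             A minimal sketch of the online-append pattern described above,
+             using throwaway example data (the frame and column names here are
+             illustrative only):
+ 
+             .. code-block:: python
+ 
+                 import polars as pl
+ 
+                 df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
+                 new_rows = pl.DataFrame({"foo": [10], "bar": [40]})
+ 
+                 # append the new batch in place, then rerun the query
+                 df.extend(new_rows)
+                 total = df["bar"].sum()  # 55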
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... 
) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. 
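A minimal usage sketch for the sequential variants select_seq and with_columns_seq documented above, assuming only that polars is installed; the column names are illustrative. They take the same expression and keyword inputs as select and with_columns but evaluate the expressions one after another, which can be preferable when each expression is cheap.

import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": [0.5, 4.0, 10.0]})

# Same expression/keyword inputs as with_columns, evaluated sequentially
# rather than in parallel.
out = df.with_columns_seq(
    a_plus_one=pl.col("a") + 1,
    half_b=pl.col("b") / 2,
)
print(out)

# select_seq mirrors select in the same way; strings are parsed as column names.
print(df.select_seq("a", pl.col("b").alias("b_copy")))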
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
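A short sketch of the keep="none" strategy of unique(), which the examples above do not cover; it assumes only that polars is installed, and the data is illustrative. Rather than keeping one representative per duplicate group, keep="none" drops every row whose subset value occurs more than once.

import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3, 1], "bar": ["a", "b", "c", "d"]})

# foo == 1 occurs twice, so both of those rows are removed; only the rows
# with foo == 2 and foo == 3 remain, in their original order.
print(df.unique(subset=["foo"], keep="none", maintain_order=True))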
+ + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self, *args, **kwargs) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using ``iter_rows`` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. 
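A small sketch relating n_chunks(), rechunk() and shrink_to_fit(), assuming only that polars is installed; the concat call and tiny frames are illustrative, and the chunk counts printed may vary with the polars version.

import polars as pl

df1 = pl.DataFrame({"a": [1, 2]})
df2 = pl.DataFrame({"a": [3, 4]})

# Concatenating without rechunking typically leaves the result split across
# multiple chunks per column.
stacked = pl.concat([df1, df2], rechunk=False)
print(stacked.n_chunks())

# rechunk() copies each column into one contiguous allocation, and
# shrink_to_fit() trims any excess capacity.
compact = stacked.rechunk().shrink_to_fit()
print(compact.n_chunks())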
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy ``corrcoef``. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... 
).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def groupby(self, *args, **kwargs) -> GroupBy: + """ + Start a group by operation. + + Alias for :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, *args, **kwargs) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. 
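A minimal sketch for set_sorted(), assuming only that polars is installed; the data reuses the names from the merge_sorted example above, and printing the flags property is just one way to observe the effect. set_sorted does not sort anything: it records a sortedness flag on the column, and it is the caller's responsibility that the data really is sorted.

import polars as pl

df = pl.DataFrame({"age": [18, 42, 44], "name": ["bob", "steve", "elise"]})

# Flag "age" as already sorted so later operations (e.g. merge_sorted) can
# take fast paths; no reordering happens here.
df = df.set_sorted("age")
print(df.flags)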
+ + Alias for :func:`DataFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, *args, **kwargs) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Alias for :func:`DataFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. 
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, *args, **kwargs) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/expr/expr deleted file mode 100644 index fe2438c..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/expr/expr +++ /dev/null @@ -1,270 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, MapElementsStrategy as MapElementsStrategy, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as 
deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int | bool) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int | bool) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr | int | bool) -> Self: ... - def __rxor__(self, other: Any) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self, *, ignore_nulls: bool = ...) -> Self: ... - def all(self, *, ignore_nulls: bool = ...) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def cbrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def keep_name(self) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def is_not(self) -> Self: ... - def not_(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... 
- def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first(self) -> Self: ... - def is_last(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: ... - def qcut(self, quantiles: Sequence[float] | int, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> Self: ... - def rle(self) -> Self: ... - def rle_id(self) -> Self: ... 
- def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | None | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... 
- def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... 
- def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/expr/expr.pyi new file mode 100644 index 0000000..6d2fb87 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/expr/expr.pyi @@ -0,0 +1,7882 @@ +#: version 0.19.2 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor 
+from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self, *args, **kwargs) -> Self: + ''' + Return whether any of the values in the column are ``True``. 
+ + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self, *args, **kwargs) -> Self: + ''' + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... ) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().map_alias(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().reverse().prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").keep_name()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. 
removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self, *args, **kwargs) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... 
) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + Warnings + -------- + Note that null values are not floating point NaN values! + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... ) + >>> df.select(pl.col("b").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ b │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + │ 4.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop floating point NaN values. + + Warnings + -------- + Note that NaN values are not null values! + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4.0, 4.0, float("nan")], + ... } + ... 
) + >>> df.select(pl.col("b").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 4.0 │ + │ 4.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. 
+ + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. 
+ strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. 
+ + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: Expr | int | float | Series, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. 
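+ 
+ Null values are ignored when computing the mean. A minimal check, with
+ an assumed toy column containing one missing value:
+ 
+ >>> import polars as pl
+ >>> pl.DataFrame({"a": [1, None, 3]}).select(pl.col("a").mean()).to_series().to_list()
+ [2.0]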
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. 
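+ 
+ As a rough sketch of that equivalence (toy columns ``g`` and ``v`` are
+ assumed purely for illustration):
+ 
+ >>> import polars as pl
+ >>> df = pl.DataFrame({"g": ["a", "a", "b"], "v": [1, 2, 3]})
+ >>> # window form: one max per group, broadcast back to every row
+ >>> df.with_columns(pl.col("v").max().over("g").alias("v_max"))  # doctest: +IGNORE_RESULT
+ >>> # roughly the same result (up to row order) via group_by + join
+ >>> df.join(df.group_by("g").agg(pl.col("v").max().alias("v_max")), on="g")  # doctest: +IGNORE_RESULT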
+ + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first(self) -> Self: + ''' + Get a mask of the first unique value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... 
) + >>> df.with_columns(pl.col("num").is_first().alias("is_first")) + shape: (5, 2) + ┌─────┬──────────┐ + │ num ┆ is_first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪══════════╡ + │ 1 ┆ true │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ false │ + │ 5 ┆ true │ + └─────┴──────────┘ + + ''' + def is_last(self) -> Self: + ''' + Get a mask of the last unique value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "num": [1, 2, 3, 1, 5], + ... } + ... ) + >>> df.with_columns(pl.col("num").is_last().alias("is_last")) + shape: (5, 2) + ┌─────┬─────────┐ + │ num ┆ is_last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 1 ┆ true │ + │ 5 ┆ true │ + └─────┴─────────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Get mask of duplicated values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... 
).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. 
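+ 
+ For instance, the run IDs can serve directly as grouping keys (a small
+ sketch with an assumed column ``a``):
+ 
+ >>> import polars as pl
+ >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 1]})
+ >>> df.group_by(pl.col("a").rle_id().alias("run"), maintain_order=True).agg(
+ ...     pl.col("a").first(), pl.count()
+ ... )  # doctest: +IGNORE_RESULT
+ 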
This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for ``map`` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Notes + ----- + If you are looking to map a function over a window function or groupby context, + refer to func:`map_elements` instead. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + map_elements + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... 
) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type ``Callable[[Any], Any]``. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type ``Callable[[Series], Any]``. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be ``pl.Unknown``. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). + pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using ``map_elements`` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using ``over`` is considered a GroupBy context + here, so ``map_elements`` can be used to map functions over window groups. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... 
df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using ``over`` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... ).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. 
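+ 
+ Values at indices ``0, n, 2n, ...`` are kept, so the first element is
+ always included. A minimal check, with an assumed toy column:
+ 
+ >>> import polars as pl
+ >>> pl.DataFrame({"x": [0, 1, 2, 3, 4, 5]}).select(pl.col("x").take_every(2)).to_series().to_list()
+ [0, 2, 4]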
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. 
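+ 
+ The method form returns the same result as the operator and is convenient
+ when building expressions programmatically. A minimal check, with an
+ assumed toy column:
+ 
+ >>> import polars as pl
+ >>> df = pl.DataFrame({"x": [1, 2, 3]})
+ >>> df.select(pl.col("x").lt(2)).frame_equal(df.select(pl.col("x") < 2))
+ True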
+ + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. 
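+ 
+ Unlike ``truediv``, integer inputs keep their integer dtype (compare the
+ two columns in the example below). A minimal check, with an assumed toy
+ column:
+ 
+ >>> import polars as pl
+ >>> pl.DataFrame({"x": [5, 6, 7]}).select(pl.col("x").floordiv(2)).to_series().to_list()
+ [2, 3, 3]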
+ + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. 
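+ 
+ As in the example below, integer inputs are returned as floats by this
+ method in this version. A minimal check, with an assumed toy column:
+ 
+ >>> import polars as pl
+ >>> pl.DataFrame({"x": [1, 2, 3]}).select(pl.col("x").pow(2)).to_series().to_list()
+ [1.0, 4.0, 9.0]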
+ + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. 
For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, *args, **kwargs) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, *args, **kwargs) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, *args, **kwargs) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, *args, **kwargs) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, *args, **kwargs) -> Self: + ''' + Compute a rolling standard deviation. 
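
[Editor's illustration, not part of the generated stub.] The rolling_* docstrings above all describe the same temporal-window closure rule: with a `by` column, `closed="left"` gives windows `[t - window_size, t)` while `closed="both"` also includes both endpoints. A minimal sketch of that difference, reusing the same frame and the same 0.18/0.19-era polars calls that appear in the doctests above:

# Hedged sketch: assumes the polars version targeted by these stubs.
from datetime import datetime
import polars as pl

df_temporal = pl.DataFrame(
    {"date": pl.date_range(datetime(2001, 1, 1), datetime(2001, 1, 2), "1h", eager=True)}
).with_row_count()

out = df_temporal.with_columns(
    # closed="left": window is [t - 2h, t), the current row is excluded
    left=pl.col("row_nr").rolling_sum(window_size="2h", by="date", closed="left"),
    # closed="both": both endpoints included, the current row counts as well
    both=pl.col("row_nr").rolling_sum(window_size="2h", by="date", closed="both"),
)
# Per the tables in the docstrings above, at row_nr == 3 this gives
# left == 3 (rows 1 + 2) and both == 6 (rows 1 + 2 + 3).
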
+ + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, *args, **kwargs) -> Self: + ''' + Compute a rolling variance. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.date_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, *args, **kwargs) -> Self: + ''' + Compute a rolling median. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". 
+ + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, *args, **kwargs) -> Self: + ''' + Compute a rolling quantile. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... 
), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. 
+ - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. 
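
[Editor's illustration, not part of the generated stub.] The Notes below spell out the biased Fisher-Pearson coefficient g1 = m3 / m2^(3/2). A short hand computation of that formula, checked against the `skew()` example value quoted further down (0.343622 for [1, 2, 3, 2, 1]):

# Hedged sketch: assumes the polars version targeted by these stubs.
import polars as pl

data = [1, 2, 3, 2, 1]
mean = sum(data) / len(data)
m2 = sum((x - mean) ** 2 for x in data) / len(data)  # biased 2nd central moment
m3 = sum((x - mean) ** 3 for x in data) / len(data)  # biased 3rd central moment
g1 = m3 / m2**1.5  # ~0.3436, the value shown in the Examples below

assert abs(pl.Series(data).skew() - g1) < 1e-6
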
+ + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. 
+ + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self, *args, **kwargs) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) 
-> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self, *args, **kwargs) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use ``pl.first()``, to keep the original value. 
+ return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... 
) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def map(self, *args, **kwargs) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, *args, **kwargs) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, *args, **kwargs) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/lazyframe/frame deleted file mode 100644 index fd22e6d..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/lazyframe/frame +++ /dev/null @@ -1,156 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat, subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, ColumnNameOrSelector as ColumnNameOrSelector, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._async import _AsyncDataFrameResult as _AsyncDataFrameResult -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath -from queue import Queue -from typing import Any, Callable, ClassVar, Collection, Concatenate, Iterable, Literal, Mapping, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | 
None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state) -> None: ... - @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., schema: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... - def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def serialize(self, file: None = ...) -> str: ... - @overload - def serialize(self, file: IOBase | str | Path) -> None: ... - @overload - def write_json(self, file: None = ...) -> str: ... 
- @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | Path | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def collect_async(self, queue: Queue[DataFrame | Exception], *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> _AsyncDataFrameResult[DataFrame]: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... 
- def sink_csv(self, path: str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: IntoExpr) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... 
- def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: ... - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map_batches(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... 
- def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..eae35e3 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/lazyframe/frame.pyi @@ -0,0 +1,3981 @@ +#: version 0.19.2 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AsyncDataFrameResult as _AsyncDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalise_filepath as normalise_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. 
+ + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, pyarrow: bool = ...) -> Self: ... + @classmethod + def from_json(cls, *args, **kwargs) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to ``StringIO`` + and then use ``LazyFrame.deserialize``. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, *args, **kwargs) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to ``deserialize``. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. 
+ + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, *args, **kwargs) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self, *args, **kwargs) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``True``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... 
).explain() # doctest: +SKIP + ''' + def show_graph(self, *args, **kwargs) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self, *args, **kwargs) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. 
+ + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self, *args, **kwargs) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self, queue: Queue[DataFrame | Exception]) -> _AsyncDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. 
+ + Collects into a DataFrame, like :func:`collect` + but instead of returning DataFrame directly its collected inside thread pool + and gets put into `queue` with `put_nowait` method, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + You must use correct queue in that case. + Given `queue` must be thread safe! + + For gevent use + [`gevent.queue.Queue`](https://www.gevent.org/api/gevent.queue.html#gevent.queue.Queue). + + For asyncio + [`asyncio.queues.Queue`](https://docs.python.org/3/library/asyncio-queue.html#queue) + can not be used, since it\'s not thread safe! + For that purpose use [janus](https://github.com/aio-libs/janus) library. + + Notes + ----- + Results are put in queue exactly once using `put_nowait`. + If error occurred then Exception will be put in the queue instead of result + which is then raised by returned wrapper `get` method. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + Wrapper that has `get` method and `queue` attribute with given queue. + `get` accepts kwargs that are passed down to `queue.get`. + + Examples + -------- + >>> import queue + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> a = ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async(queue.Queue()) + ... ) + >>> a.get() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. 
+        simplify_expression
+            Run simplify expressions optimization.
+        no_optimization
+            Turn off (certain) optimizations.
+        slice_pushdown
+            Slice pushdown optimization.
+
+        Returns
+        -------
+        DataFrame
+
+        Examples
+        --------
+        >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv")  # doctest: +SKIP
+        >>> lf.sink_parquet("out.parquet")  # doctest: +SKIP
+
+        '''
+    def sink_ipc(self, path: str | Path) -> DataFrame:
+        '''
+        Persists a LazyFrame at the provided path.
+
+        This allows streaming results that are larger than RAM to be written to disk.
+
+        Parameters
+        ----------
+        path
+            File path to which the file should be written.
+        compression : {\'lz4\', \'zstd\'}
+            Choose "zstd" for good compression performance.
+            Choose "lz4" for fast compression/decompression.
+        maintain_order
+            Maintain the order in which data is processed.
+            Setting this to `False` will be slightly faster.
+        type_coercion
+            Do type coercion optimization.
+        predicate_pushdown
+            Do predicate pushdown optimization.
+        projection_pushdown
+            Do projection pushdown optimization.
+        simplify_expression
+            Run simplify expressions optimization.
+        no_optimization
+            Turn off (certain) optimizations.
+        slice_pushdown
+            Slice pushdown optimization.
+
+        Returns
+        -------
+        DataFrame
+
+        Examples
+        --------
+        >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv")  # doctest: +SKIP
+        >>> lf.sink_ipc("out.arrow")  # doctest: +SKIP
+
+        '''
+    def sink_csv(self, path: str | Path) -> DataFrame:
+        '''
+        Persists a LazyFrame at the provided path.
+
+        This allows streaming results that are larger than RAM to be written to disk.
+
+        Parameters
+        ----------
+        path
+            File path to which the file should be written.
+        has_header
+            Whether to include header in the CSV output.
+        separator
+            Separate CSV fields with this symbol.
+        line_terminator
+            String used to end each row.
+        quote
+            Byte to use as quoting character.
+        batch_size
+            Number of rows that will be processed per thread.
+        datetime_format
+            A format string, with the specifiers defined by the ``chrono`` Rust
+            crate. If no format is specified, the default fractional-second
+            precision is inferred from the maximum timeunit found in the frame\'s
+            Datetime cols (if any).
+        date_format
+            A format string, with the specifiers defined by the ``chrono`` Rust
+            crate.
+        time_format
+            A format string, with the specifiers defined by the ``chrono`` Rust
+            crate.
+        float_precision
+            Number of decimal places to write, applied to both ``Float32`` and
+            ``Float64`` datatypes.
+        null_value
+            A string representing null values (defaulting to the empty string).
+        quote_style : {\'necessary\', \'always\', \'non_numeric\'}
+            Determines the quoting strategy used.
+            - necessary (default): This puts quotes around fields only when necessary.
+              They are necessary when fields contain a quote,
+              delimiter or record terminator.
+              Quotes are also necessary when writing an empty record
+              (which is indistinguishable from a record with one empty field).
+            - always: This puts quotes around every field. Always.
+            - non_numeric: This puts quotes around all fields that are non-numeric.
+              Namely, when writing a field that does not parse as a valid float
+              or integer, then quotes will be used even if they aren\'t strictly
+              necessary.
+        maintain_order
+            Maintain the order in which data is processed.
+            Setting this to `False` will be slightly faster.
+        type_coercion
+            Do type coercion optimization.
+        predicate_pushdown
+            Do predicate pushdown optimization.
+ projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def fetch(self, *args, **kwargs) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Fetch is like a :func:`collect` operation, but it overwrites the number of rows + read by every scan operation. This is a utility that helps debug a query on a + smaller number of rows. + + Note that the fetch does not guarantee the final number of rows in the + DataFrame. Filter, join operations and a lower number of rows available in the + scanned file influence the final number of rows. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: IntoExpr) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. 
+            The columns will be renamed to the keyword used.
+
+        See Also
+        --------
+        select
+
+        """
+    def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy:
+        '''
+        Start a group by operation.
+
+        Parameters
+        ----------
+        by
+            Column(s) to group by. Accepts expression input. Strings are parsed as
+            column names.
+        *more_by
+            Additional columns to group by, specified as positional arguments.
+        maintain_order
+            Ensure that the order of the groups is consistent with the input data.
+            This is slower than a default group by.
+            Setting this to ``True`` blocks the possibility
+            of running on the streaming engine.
+
+        Examples
+        --------
+        Group by one column and call ``agg`` to compute the grouped sum of another
+        column.
+
+        >>> lf = pl.LazyFrame(
+        ...     {
+        ...         "a": ["a", "b", "a", "b", "c"],
+        ...         "b": [1, 2, 1, 3, 3],
+        ...         "c": [5, 4, 3, 2, 1],
+        ...     }
+        ... )
+        >>> lf.group_by("a").agg(pl.col("b").sum()).collect()  # doctest: +IGNORE_RESULT
+        shape: (3, 2)
+        ┌─────┬─────┐
+        │ a   ┆ b   │
+        │ --- ┆ --- │
+        │ str ┆ i64 │
+        ╞═════╪═════╡
+        │ a   ┆ 2   │
+        │ b   ┆ 5   │
+        │ c   ┆ 3   │
+        └─────┴─────┘
+
+        Set ``maintain_order=True`` to ensure the order of the groups is consistent with
+        the input.
+
+        >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect()
+        shape: (3, 2)
+        ┌─────┬───────────┐
+        │ a   ┆ c         │
+        │ --- ┆ ---       │
+        │ str ┆ list[i64] │
+        ╞═════╪═══════════╡
+        │ a   ┆ [5, 3]    │
+        │ b   ┆ [4, 2]    │
+        │ c   ┆ [1]       │
+        └─────┴───────────┘
+
+        Group by multiple columns by passing a list of column names.
+
+        >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect()  # doctest: +SKIP
+        shape: (4, 3)
+        ┌─────┬─────┬─────┐
+        │ a   ┆ b   ┆ c   │
+        │ --- ┆ --- ┆ --- │
+        │ str ┆ i64 ┆ i64 │
+        ╞═════╪═════╪═════╡
+        │ a   ┆ 1   ┆ 5   │
+        │ b   ┆ 2   ┆ 4   │
+        │ b   ┆ 3   ┆ 2   │
+        │ c   ┆ 3   ┆ 1   │
+        └─────┴─────┴─────┘
+
+        Or use positional arguments to group by multiple columns in the same way.
+        Expressions are also accepted.
+
+        >>> lf.group_by("a", pl.col("b") // 2).agg(
+        ...     pl.col("c").mean()
+        ... ).collect()  # doctest: +SKIP
+        shape: (3, 3)
+        ┌─────┬─────┬─────┐
+        │ a   ┆ b   ┆ c   │
+        │ --- ┆ --- ┆ --- │
+        │ str ┆ i64 ┆ f64 │
+        ╞═════╪═════╪═════╡
+        │ a   ┆ 0   ┆ 4.0 │
+        │ b   ┆ 1   ┆ 3.0 │
+        │ c   ┆ 1   ┆ 1.0 │
+        └─────┴─────┴─────┘
+
+        '''
+    def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy:
+        '''
+        Create rolling groups based on a time, Int32, or Int64 column.
+
+        Different from a ``dynamic_group_by``, the windows are determined by the
+        individual values and are not of constant intervals. For constant intervals
+        use :func:`LazyFrame.group_by_dynamic`.
+
+        If you have a time series ``<t_0, t_1, ..., t_n>``, then by default the
+        windows created will be
+
+        * (t_0 - period, t_0]
+        * (t_1 - period, t_1]
+        * ...
+        * (t_n - period, t_n]
+
+        The `period` and `offset` arguments are created either from a timedelta, or
+        by using the following string language:
+
+        - 1ns (1 nanosecond)
+        - 1us (1 microsecond)
+        - 1ms (1 millisecond)
+        - 1s (1 second)
+        - 1m (1 minute)
+        - 1h (1 hour)
+        - 1d (1 calendar day)
+        - 1w (1 calendar week)
+        - 1mo (1 calendar month)
+        - 1q (1 calendar quarter)
+        - 1y (1 calendar year)
+        - 1i (1 index count)
+
+        Or combine them:
+        "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+
+        Suffix with `"_saturating"` to indicate that dates too large for
+        their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28)
+        instead of erroring.
+
+        By "calendar day", we mean the corresponding time on the next day (which may
+        not be 24 hours, due to daylight savings). 
Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.group_by_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. 
+
+        A window is defined by:
+
+        - every: interval of the window
+        - period: length of the window
+        - offset: offset of the window
+
+        The `every`, `period` and `offset` arguments are created with
+        the following string language:
+
+        - 1ns (1 nanosecond)
+        - 1us (1 microsecond)
+        - 1ms (1 millisecond)
+        - 1s (1 second)
+        - 1m (1 minute)
+        - 1h (1 hour)
+        - 1d (1 calendar day)
+        - 1w (1 calendar week)
+        - 1mo (1 calendar month)
+        - 1q (1 calendar quarter)
+        - 1y (1 calendar year)
+        - 1i (1 index count)
+
+        Or combine them:
+        "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
+
+        Suffix with `"_saturating"` to indicate that dates too large for
+        their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28)
+        instead of erroring.
+
+        By "calendar day", we mean the corresponding time on the next day (which may
+        not be 24 hours, due to daylight savings). Similarly for "calendar week",
+        "calendar month", "calendar quarter", and "calendar year".
+
+        In case of a group_by_dynamic on an integer column, the windows are defined by:
+
+        - "1i" # length 1
+        - "10i" # length 10
+
+        .. warning::
+            The index column must be sorted in ascending order. If `by` is passed, then
+            the index column must be sorted in ascending order within each group.
+
+        Parameters
+        ----------
+        index_column
+            Column used to group based on the time window.
+            Often of type Date/Datetime.
+            This column must be sorted in ascending order (or, if `by` is specified,
+            then it must be sorted in ascending order within each group).
+
+            In case of a dynamic group by on indices, dtype needs to be one of
+            {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if
+            performance matters use an Int64 column.
+        every
+            Interval of the window.
+        period
+            Length of the window; if None, it is equal to \'every\'.
+        offset
+            Offset of the window; if None and `period` is None, it will be equal to
+            negative `every`.
+        truncate
+            Truncate the time value to the window lower bound.
+        include_boundaries
+            Add the lower and upper bound of the window to the "_lower_bound" and
+            "_upper_bound" columns. This will impact performance because it\'s harder to
+            parallelize.
+        closed : {\'right\', \'left\', \'both\', \'none\'}
+            Define which sides of the temporal interval are closed (inclusive).
+        by
+            Also group by this column/these columns.
+        start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'}
+            The strategy to determine the start of the first window by.
+
+            * \'window\': Truncate the start of the window with the \'every\' argument.
+              Note that weekly windows start on Monday.
+            * \'datapoint\': Start from the first encountered data point.
+            * a day of the week (only takes effect if `every` contains ``\'w\'``):
+
+              * \'monday\': Start the window on the Monday before the first data point.
+              * \'tuesday\': Start the window on the Tuesday before the first data point.
+              * ...
+              * \'sunday\': Start the window on the Sunday before the first data point.
+        check_sorted
+            When the ``by`` argument is given, polars cannot check sortedness
+            by the metadata and has to do a full scan on the index column to
+            verify data is sorted. This is expensive. If you are sure the
+            data within the by groups is sorted, you can set this to ``False``.
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_rolling + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... [ + ... 
pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.date_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... 
).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12),
+        ...             datetime(2019, 5, 12),
+        ...         ],  # note record date: May 12th (sorted!)
+        ...         "population": [82.19, 82.66, 83.12, 83.52],
+        ...     }
+        ... ).set_sorted("date")
+        >>> population.join_asof(gdp, on="date", strategy="backward").collect()
+        shape: (4, 3)
+        ┌─────────────────────┬────────────┬──────┐
+        │ date                ┆ population ┆ gdp  │
+        │ ---                 ┆ ---        ┆ ---  │
+        │ datetime[μs]        ┆ f64        ┆ i64  │
+        ╞═════════════════════╪════════════╪══════╡
+        │ 2016-05-12 00:00:00 ┆ 82.19      ┆ 4164 │
+        │ 2017-05-12 00:00:00 ┆ 82.66      ┆ 4411 │
+        │ 2018-05-12 00:00:00 ┆ 83.12      ┆ 4566 │
+        │ 2019-05-12 00:00:00 ┆ 83.52      ┆ 4696 │
+        └─────────────────────┴────────────┴──────┘
+
+        '''
+    def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self:
+        '''
+        Add a join operation to the Logical Plan.
+
+        Parameters
+        ----------
+        other
+            Lazy DataFrame to join with.
+        on
+            Join column of both DataFrames. If set, `left_on` and `right_on` should be
+            None.
+        how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'}
+            Join strategy.
+        left_on
+            Join column of the left DataFrame.
+        right_on
+            Join column of the right DataFrame.
+        suffix
+            Suffix to append to columns with a duplicate name.
+        validate : {\'m:m\', \'m:1\', \'1:m\', \'1:1\'}
+            Checks that the join is of the specified type.
+
+            * *many_to_many*
+                “m:m”: default, does not result in checks
+            * *one_to_one*
+                “1:1”: check if join keys are unique in both left and right datasets
+            * *one_to_many*
+                “1:m”: check if join keys are unique in left dataset
+            * *many_to_one*
+                “m:1”: check if join keys are unique in right dataset
+
+            .. note::
+
+                - This is currently not supported by the streaming engine.
+                - This is only supported when joined by single columns.
+        allow_parallel
+            Allow the physical plan to optionally evaluate the computation of both
+            DataFrames up to the join in parallel.
+        force_parallel
+            Force the physical plan to evaluate the computation of both DataFrames up to
+            the join in parallel.
+
+        See Also
+        --------
+        join_asof
+
+        Examples
+        --------
+        >>> lf = pl.LazyFrame(
+        ...     {
+        ...         "foo": [1, 2, 3],
+        ...         "bar": [6.0, 7.0, 8.0],
+        ...         "ham": ["a", "b", "c"],
+        ...     }
+        ... )
+        >>> other_lf = pl.LazyFrame(
+        ...     {
+        ...         "apple": ["x", "y", "z"],
+        ...         "ham": ["a", "b", "d"],
+        ...     }
+        ... 
) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. 
+ + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self, *args, **kwargs) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... 
) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + Returns a new LazyFrame. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. 
+ + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. 
+ predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def groupby(self, *args, **kwargs) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, *args, **kwargs) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, *args, **kwargs) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, *args, **kwargs) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
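The deprecated `groupby`, `groupby_rolling`, `groupby_dynamic`, and `map` entries above only document the pre-0.19 names; their docstrings point to the renamed `group_by*` and `map_batches` methods. A minimal sketch of what the 0.19+ spelling of a grouped aggregation looks like (the frame and column names here are illustrative and not taken from the stubs):

import polars as pl

lf = pl.LazyFrame({"g": ["a", "a", "b"], "x": [1, 2, 3]})

# Pre-0.19 spelling, still accepted through the deprecated wrapper in this stub:
#   lf.groupby("g").agg(pl.col("x").sum())

# 0.19+ spelling that the deprecation notices recommend:
out = lf.group_by("g", maintain_order=True).agg(pl.col("x").sum()).collect()
print(out)  # one summed row per group: "a" -> 3, "b" -> 3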
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/series/series deleted file mode 100644 index fc5c030..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/series/series +++ /dev/null @@ -1,364 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.deprecation import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, 
deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, Literal, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> tuple[int, int, int]: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> Any: ... - def __setstate__(self, state: Any) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Series: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Series: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Series: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Series: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Series: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Series: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Series: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... 
- def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - @overload - def __floordiv__(self, other: Expr) -> Expr: ... - @overload - def __floordiv__(self, other: Any) -> Series: ... - def __invert__(self) -> Series: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def __column_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _repr_html_(self) -> str: ... - def item(self, row: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def cbrt(self) -> Series: ... - @overload - def any(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def any(self, *, ignore_nulls: bool) -> bool | None: ... - @overload - def all(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def all(self, *, ignore_nulls: bool) -> bool | None: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... 
- def pow(self, exponent: int | float | None | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: bool) -> Series | DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: bool) -> Series | DataFrame: ... - def rle(self) -> Series: ... - def rle_id(self) -> Series: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool | None = ...) -> Self: ... - def extend(self, other: Series) -> Self: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... 
- def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Series: ... - def bottom_k(self, k: int = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def not_(self) -> Series: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first(self) -> Series: ... - def is_last(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... 
- def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ... 
- def clip_min(self, lower_bound: int | float) -> Series: ... - def clip_max(self, upper_bound: int | float) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... 
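The replacement `series.pyi` added below carries a `#: version 0.19.2` marker on its first line. A small sketch of reading that marker back from a stub on disk; the helper name is an illustrative assumption, and the example path is the one used in the diff below:

from pathlib import Path
from typing import Optional

def stub_polars_version(stub_path: Path) -> Optional[str]:
    # Return the version recorded on the first line of a stub, e.g. "0.19.2",
    # or None when the "#: version " marker is absent.
    text = stub_path.read_text()
    first_line = text.splitlines()[0] if text else ""
    prefix = "#: version "
    return first_line[len(prefix):].strip() if first_line.startswith(prefix) else None

# Example (path as in the diff below):
# stub_polars_version(Path("src/polugins_type_gen/_stubs/0.19.2/polars/series/series.pyi"))
# -> "0.19.2"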
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/series/series.pyi new file mode 100644 index 0000000..7e66eb6 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.2/polars/series/series.pyi @@ -0,0 +1,4600 @@ +#: version 0.19.2 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.deprecation import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: 
_ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the ``Series`` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series == other`` where `None` == None`. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series != other`` where `None` == None`. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, row: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given row index. + + If no row index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With a row index, this is equivalent to ``s[row]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) 
-> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self, *args, **kwargs) -> bool | None: + """ + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self, *args, **kwargs) -> bool | None: + """ + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + """ + Drop all null values. 
+ + Creates a new Series that copies data from this Series without null values. + """ + def drop_nans(self) -> Series: + """Drop NaN values.""" + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + category_label + Name of the category column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. 
+ + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, *args, **kwargs) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + category_label + Name of the category column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. + + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. 
+ + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴────────┘ + + Sort the output by count. 
+ + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. 
+ + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and ``append`` will change to always + behave like ``append_chunks=True`` (the previous default). For the + behavior of ``append_chunks=False``, use ``Series.extend``. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. 
+ + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from ``append``, which adds the chunks from ``other`` to the chunks of + this series, ``extend`` appends the data from ``other`` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``append`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer ``append`` over ``extend`` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single ``Series``. In the latter case, finish the sequence + of ``append`` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. 
+ + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def not_(self) -> Series: + ''' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first(self) -> Series: + """ + Get a mask of the first unique value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_last(self) -> Series: + """ + Get a mask of the last unique value. + + Returns + ------- + Boolean Series + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. 
+ + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. 
+ + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point ``nan`` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set ``zero_copy_only=True``. + + Alternatively, if you want a zero-copy view and know what you are doing, + use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. 
+ Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + Series + The mutated series. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). 
+ + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. 
+ + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. 
Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. 
+ + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. 
+ + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. 
+ + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. 
+ + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a random + seed is generated using the ``random`` module. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. 
+ + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, *args, **kwargs) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
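The docstrings in the Series stub above record the 0.19 renames (``apply`` -> ``map_elements``, ``rolling_apply`` -> ``rolling_map``) and repeatedly advise preferring native expressions over Python UDFs. A minimal usage sketch of that migration, illustrative only and not part of this diff, assuming polars >= 0.19 is installed:

import polars as pl

s = pl.Series("a", [1, 2, 3])

# Deprecated since 0.19.0: s.apply(lambda x: x + 10)
# Renamed replacement; still a Python UDF, so still slow:
s.map_elements(lambda x: x + 10)

# Preferred: the same result via native (Rust-side) arithmetic:
s + 10

# Likewise rolling_apply -> rolling_map; the specialized rolling
# functions stay the faster choice when one fits:
s.rolling_map(lambda window: window.sum(), window_size=2)  # custom window function
s.rolling_sum(window_size=2)                               # specialized, preferred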
+def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/dataframe/frame deleted file mode 100644 index 450b010..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/dataframe/frame +++ /dev/null @@ -1,300 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase, TextIOWrapper -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions import col as col, lit as lit -from polars.interchange.dataframe import PolarsDataFrame as PolarsDataFrame -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnFormatDict as ColumnFormatDict, ColumnNameOrSelector as ColumnNameOrSelector, ColumnTotalsDefinition as ColumnTotalsDefinition, ColumnWidthsDefinition as ColumnWidthsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, 
RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SelectorType as SelectorType, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... 
- @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., schema: None | SchemaDict = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ..., ignore_errors: bool = ...) -> Self: ... - def _replace(self, column: str, new_column: Series) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def flags(self) -> dict[str, dict[str, bool]]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... 
- def _div(self, other: Any, *, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state: list[Series]) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Series]: ... - def __reversed__(self) -> Iterator[Series]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... - @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... 
- @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: ColumnFormatDict | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., header_format: dict[str, Any] | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: ColumnWidthsDefinition | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | SelectorType | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ..., freeze_panes: str | tuple[int, int] | tuple[str, int, int] | tuple[int, int, int, int] | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - @overload - def write_ipc_stream(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, Any] | None = ...) -> None: ... - def write_database(self, table_name: str, connection: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: str | Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... 
- def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, max_items_per_column: int = ..., max_colname_length: int = ..., return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, max_items_per_column: int = ..., max_colname_length: int = ..., return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... 
- def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, other: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: DataFrame) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> DataFrame: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... 
- @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def approx_n_unique(self) -> DataFrame: ... - def approx_unique(self) -> DataFrame: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def corr(self, **kwargs: Any) -> DataFrame: ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... 
- def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/dataframe/frame.pyi new file mode 100644 index 0000000..61f5738 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/dataframe/frame.pyi @@ -0,0 +1,6596 @@ +#: version 0.19.3 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as 
series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. 
+ * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) 
-> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). 
+ columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to ``True`` will raise a ``NotImplementedError``. + allow_copy + Allow memory to be copied to perform the conversion. If set to ``False``, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars dataframe to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... 
+ def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + ``structured`` is set to ``False`` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. 
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. 
Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname(s):str,}`` or ``{selector:str,}`` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in ``dtype_formats``. + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + header_format : dict + A ``{key:value,}`` dictionary of ``xlsxwriter`` format options to apply + to the table header row, such as ``{"bold":True, "font_color":"#702963"}``. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. 
+ + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` or ``{selector:int,}`` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. 
+ freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible ``xlsxwriter`` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... 
) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... 
"style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC record batch data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + If you pass ``partition_cols`` here, the dataset will be written + using ``pyarrow.parquet.write_to_dataset``. + The ``partition_cols`` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. 
+ + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, *args, **kwargs) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. 
+ + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. 
+ All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. 
+ + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, *args, **kwargs) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. 
+ **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. 
+ + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``group_by_dynamic`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. 
Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.group_by_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. + + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". 
+ + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg( + ... [ + ... pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ) + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg( + ... [pl.col("time").count().alias("time_count")] + ... 
) + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg([pl.col("time").count().alias("time_count")]) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. 
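+
+        New rows are inserted at the given interval; for those inserted rows all
+        columns other than the time column are null, and can then be filled, for
+        example with a forward fill as shown in the example below.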
+ + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. 
+ by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. 
+ + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. 
+ + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, *args, **kwargs) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from ``vstack`` which adds the chunks from ``other`` to the chunks of + this ``DataFrame``, ``extend`` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``vstack`` when you want to do a query after a single + append. 
For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer ``vstack`` over ``extend`` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single ``DataFrame``. In the latter case, finish the sequence of + ``vstack`` operations with a ``rechunk``. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. + + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... 
) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self, *args, **kwargs) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using ``iter_rows`` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy ``corrcoef``. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... 
).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def groupby(self, *args, **kwargs) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. 
+ + """ + def groupby_rolling(self, *args, **kwargs) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, *args, **kwargs) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. 
If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, *args, **kwargs) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/expr/expr deleted file mode 100644 index b06b5f8..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/expr/expr +++ /dev/null @@ -1,273 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, MapElementsStrategy as MapElementsStrategy, NullBehavior as NullBehavior, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import 
deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int | bool) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int | bool) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr | int | bool) -> Self: ... - def __rxor__(self, other: Any) -> Self: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self, *, ignore_nulls: bool = ...) -> Self: ... - def all(self, *, ignore_nulls: bool = ...) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def cbrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def keep_name(self) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def is_not(self) -> Self: ... - def not_(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... 
- def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int = ...) -> Self: ... - def bottom_k(self, k: int = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first_distinct(self) -> Self: ... - def is_last_distinct(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: ... - def qcut(self, quantiles: Sequence[float] | int, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> Self: ... - def rle(self) -> Self: ... 
- def rle_id(self) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | None | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... 
- def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: ... - def clip_min(self, lower_bound: int | float) -> Self: ... - def clip_max(self, upper_bound: int | float) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... 
- def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def is_first(self) -> Self: ... - def is_last(self) -> Self: ... - def _register_plugin(self, lib: str, symbol: str, args: list[IntoExpr] | None = ..., *, is_elementwise: bool = ..., input_wildcard_expansion: bool = ..., auto_explode: bool = ..., cast_to_supertypes: bool = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/expr/expr.pyi new file mode 100644 index 0000000..e0bb6f1 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/expr/expr.pyi @@ -0,0 +1,7965 @@ +#: version 0.19.3 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, 
deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... 
) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self, *args, **kwargs) -> Self: + ''' + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self, *args, **kwargs) -> Self: + ''' + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... ) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().map_alias(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. 
+ + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").keep_name()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... 
) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self, *args, **kwargs) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. 
+ + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... 
) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... 
"b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. 
+ + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', the index of the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... 
) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) 
-> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... 
) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmin` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. 
+ + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. 
The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... 
pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. 
+ + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for ``map`` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Notes + ----- + If you are looking to map a function over a window function or groupby context, + refer to func:`map_elements` instead. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + map_elements + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type ``Callable[[Any], Any]``. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type ``Callable[[Series], Any]``. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be ``pl.Unknown``. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). + pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using ``map_elements`` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using ``over`` is considered a GroupBy context + here, so ``map_elements`` can be used to map functions over window groups. 
+ + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using ``over`` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... ).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. 
+ Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... 
) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other`` where ``None == None``. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... 
) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. 
+ + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... 
) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. 
Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... 
).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, *args, **kwargs) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, *args, **kwargs) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, *args, **kwargs) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, *args, **kwargs) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... 
window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, *args, **kwargs) -> Self: + ''' + Compute a rolling standard deviation. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. 
Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, *args, **kwargs) -> Self: + ''' + Compute a rolling variance. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. 
Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... 
rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, *args, **kwargs) -> Self: + ''' + Compute a rolling median. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, *args, **kwargs) -> Self: + ''' + Compute a rolling quantile. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. 
+ + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. 
+ + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. 
+ If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: int | float) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self, *args, **kwargs) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. 
+ + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. 
+ + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self, *args, **kwargs) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... 
.map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def map(self, *args, **kwargs) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, *args, **kwargs) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. 
+ strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, *args, **kwargs) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self, *args, **kwargs) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self, *args, **kwargs) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def _register_plugin(self, lib: str, symbol: str, args: list[IntoExpr] | None = ...) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by ``lib::symbol`` + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. + auto_explode + Explode the results in a group_by. + This is recommended for aggregation functions. + cast_to_supertypes + Cast the input datatypes to their supertype. + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/lazyframe/frame deleted file mode 100644 index 3311840..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/lazyframe/frame +++ /dev/null @@ -1,159 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat, subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, ColumnNameOrSelector as ColumnNameOrSelector, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalize_filepath as normalize_filepath -from typing import Any, Awaitable, Callable, ClassVar, Collection, Concatenate, Iterable, Literal, Mapping, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None 
= ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., schema: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, *, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... - def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... - @overload - def serialize(self, file: None = ...) -> str: ... - @overload - def serialize(self, file: IOBase | str | Path) -> None: ... - @overload - def write_json(self, file: None = ...) -> str: ... 
- @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | Path | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ..., **kwargs: Any) -> DataFrame: ... - @overload - def collect_async(self, *, gevent: Literal[True], type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> _GeventDataFrameResult[DataFrame]: ... - @overload - def collect_async(self, *, gevent: Literal[False] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> Awaitable[DataFrame]: ... - def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... 
- def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_csv(self, path: str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def _set_sink_optimizations(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> PyLazyFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: IntoExpr) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... 
- def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: ... - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map_batches(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... - def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... 
- def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..3f88125 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/lazyframe/frame.pyi @@ -0,0 +1,4011 @@ +#: version 0.19.3 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def from_json(cls, *args, **kwargs) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to ``StringIO`` + and then use ``LazyFrame.deserialize``. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, *args, **kwargs) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to ``deserialize``. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. 
+ + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, *args, **kwargs) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self, *args, **kwargs) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``True``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. 
+ streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self, *args, **kwargs) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. 
+ + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self, *args, **kwargs) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self, *args, **kwargs: Any) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + **kwargs + For internal use. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... 
) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + dataframe directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. 
+ row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. 
+ Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, *args, **kwargs) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that ``fetch`` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if ``n_rows`` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... 
) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: IntoExpr) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. 
+ The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_group_by`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). 
Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.group_by_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. The time/index + window could be seen as a rolling window, with a window size determined by + dates/times/values instead of slots in the DataFrame. 
+ + A window is defined by: + + - every: interval of the window + - period: length of the window + - offset: offset of the window + + The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. 
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_rolling + + Notes + ----- + If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + Examples + -------- + >>> from datetime import datetime + >>> # create an example dataframe + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... [ + ... pl.col("time").min().alias("time_min"), + ... pl.col("time").max().alias("time_max"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬─────────────────────┬─────────────────────┐ + │ time ┆ time_min ┆ time_max │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │ + ╞═════════════════════╪═════════════════════╪═════════════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │ + └─────────────────────┴─────────────────────┴─────────────────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + When closed="left", should not include right end of interval + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... [ + ... 
pl.col("time").count().alias("time_count"), + ... pl.col("time").alias("time_agg_list"), + ... ] + ... ).collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬───────────────────────────────────┐ + │ time ┆ time_count ┆ time_agg_list │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │ + ╞═════════════════════╪════════════╪═══════════════════════════════════╡ + │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16… │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16… │ + │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16… │ + │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │ + └─────────────────────┴────────────┴───────────────────────────────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("time").count().alias("time_count") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬────────────┐ + │ time ┆ time_count │ + │ --- ┆ --- │ + │ datetime[μs] ┆ u32 │ + ╞═════════════════════╪════════════╡ + │ 2021-12-15 23:00:00 ┆ 1 │ + │ 2021-12-16 00:00:00 ┆ 3 │ + │ 2021-12-16 01:00:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 3 │ + │ 2021-12-16 03:00:00 ┆ 1 │ + └─────────────────────┴────────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "groups": ["a", "a", "a", "b", "b", "a", "a"], + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬────────┐ + │ time ┆ groups │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪════════╡ + │ 2021-12-16 00:00:00 ┆ a │ + │ 2021-12-16 00:30:00 ┆ a │ + │ 2021-12-16 01:00:00 ┆ a │ + │ 2021-12-16 01:30:00 ┆ b │ + │ 2021-12-16 02:00:00 ┆ b │ + │ 2021-12-16 02:30:00 ┆ a │ + │ 2021-12-16 03:00:00 ┆ a │ + └─────────────────────┴────────┘ + >>> ( + ... lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ) + ... ).agg([pl.col("time").count().alias("time_count")]).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... 
closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (3, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. 
+ + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self, *args, **kwargs) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... 
) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... 
) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. 
+ + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def groupby(self, *args, **kwargs) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, *args, **kwargs) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, *args, **kwargs) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ every + interval of the window + period + length of the window, if None it is equal to \'every\' + offset + offset of the window if None and period is None it will be equal to negative + `every` + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Truncate the start of the window with the \'every\' argument. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, *args, **kwargs) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/series/series deleted file mode 100644 index b747810..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/series/series +++ /dev/null @@ -1,366 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, NullBehavior as NullBehavior, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.deprecation import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as 
deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, Literal, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> tuple[int, int, int]: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Series: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Series: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Series: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Series: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Series: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Series: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Series: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... 
-    @overload
-    def __add__(self, other: DataFrame) -> DataFrame: ...
-    @overload
-    def __add__(self, other: Expr) -> Expr: ...
-    @overload
-    def __add__(self, other: Any) -> Self: ...
-    @overload
-    def __sub__(self, other: Expr) -> Expr: ...
-    @overload
-    def __sub__(self, other: Any) -> Self: ...
-    @overload
-    def __truediv__(self, other: Expr) -> Expr: ...
-    @overload
-    def __truediv__(self, other: Any) -> Series: ...
-    @overload
-    def __floordiv__(self, other: Expr) -> Expr: ...
-    @overload
-    def __floordiv__(self, other: Any) -> Series: ...
-    def __invert__(self) -> Series: ...
-    @overload
-    def __mul__(self, other: Expr) -> Expr: ...
-    @overload
-    def __mul__(self, other: DataFrame) -> DataFrame: ...
-    @overload
-    def __mul__(self, other: Any) -> Series: ...
-    @overload
-    def __mod__(self, other: Expr) -> Expr: ...
-    @overload
-    def __mod__(self, other: Any) -> Series: ...
-    def __rmod__(self, other: Any) -> Series: ...
-    def __radd__(self, other: Any) -> Series: ...
-    def __rsub__(self, other: Any) -> Series: ...
-    def __rtruediv__(self, other: Any) -> Series: ...
-    def __rfloordiv__(self, other: Any) -> Series: ...
-    def __rmul__(self, other: Any) -> Series: ...
-    def __pow__(self, exponent: int | float | None | Series) -> Series: ...
-    def __rpow__(self, other: Any) -> Series: ...
-    def __matmul__(self, other: Any) -> float | Series | None: ...
-    def __rmatmul__(self, other: Any) -> float | Series | None: ...
-    def __neg__(self) -> Series: ...
-    def __pos__(self) -> Series: ...
-    def __abs__(self) -> Series: ...
-    def __copy__(self) -> Self: ...
-    def __deepcopy__(self, memo: None = ...) -> Self: ...
-    def __contains__(self, item: Any) -> bool: ...
-    def __iter__(self) -> Generator[Any, None, None]: ...
-    def _pos_idxs(self, size: int) -> Series: ...
-    def _take_with_series(self, s: Series) -> Series: ...
-    @overload
-    def __getitem__(self, item: int) -> Any: ...
-    @overload
-    def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ...
-    def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ...
-    def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ...
-    def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ...
-    def __column_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ...
-    def _repr_html_(self) -> str: ...
-    def item(self, index: int | None = ...) -> Any: ...
-    def estimated_size(self, unit: SizeUnit = ...) -> int | float: ...
-    def sqrt(self) -> Series: ...
-    def cbrt(self) -> Series: ...
-    @overload
-    def any(self, *, ignore_nulls: Literal[True] = ...) -> bool: ...
-    @overload
-    def any(self, *, ignore_nulls: bool) -> bool | None: ...
-    @overload
-    def all(self, *, ignore_nulls: Literal[True] = ...) -> bool: ...
-    @overload
-    def all(self, *, ignore_nulls: bool) -> bool | None: ...
-    def log(self, base: float = ...) -> Series: ...
-    def log1p(self) -> Series: ...
-    def log10(self) -> Series: ...
-    def exp(self) -> Series: ...
-    def drop_nulls(self) -> Series: ...
-    def drop_nans(self) -> Series: ...
-    def to_frame(self, name: str | None = ...) -> DataFrame: ...
-    def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ...
-    def sum(self) -> int | float: ...
-    def mean(self) -> int | float | None: ...
-    def product(self) -> int | float: ...
-    def pow(self, exponent: int | float | None | Series) -> Series: ...
-    def min(self) -> PythonLiteral | None: ...
-    def max(self) -> PythonLiteral | None: ...
-    def nan_max(self) -> int | float | date | datetime | timedelta | str: ...
-    def nan_min(self) -> int | float | date | datetime | timedelta | str: ...
-    def std(self, ddof: int = ...) -> float | None: ...
-    def var(self, ddof: int = ...) -> float | None: ...
-    def median(self) -> float | None: ...
-    def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ...
-    def to_dummies(self, separator: str = ...) -> DataFrame: ...
-    @overload
-    def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[True] = ...) -> Series: ...
-    @overload
-    def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[False]) -> DataFrame: ...
-    @overload
-    def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: bool) -> Series | DataFrame: ...
-    @overload
-    def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[True] = ...) -> Series: ...
-    @overload
-    def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[False]) -> DataFrame: ...
-    @overload
-    def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: bool) -> Series | DataFrame: ...
-    def rle(self) -> Series: ...
-    def rle_id(self) -> Series: ...
-    def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ...
-    def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> DataFrame: ...
-    def unique_counts(self) -> Series: ...
-    def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ...
-    def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ...
-    def alias(self, name: str) -> Series: ...
-    def rename(self, name: str) -> Series: ...
-    def chunk_lengths(self) -> list[int]: ...
-    def n_chunks(self) -> int: ...
-    def cummax(self, *, reverse: bool = ...) -> Series: ...
-    def cummin(self, *, reverse: bool = ...) -> Series: ...
-    def cumprod(self, *, reverse: bool = ...) -> Series: ...
-    def cumsum(self, *, reverse: bool = ...) -> Series: ...
-    def slice(self, offset: int, length: int | None = ...) -> Series: ...
-    def append(self, other: Series, *, append_chunks: bool | None = ...) -> Self: ...
-    def extend(self, other: Series) -> Self: ...
-    def filter(self, predicate: Series | list[bool]) -> Self: ...
-    def head(self, n: int = ...) -> Series: ...
-    def tail(self, n: int = ...) -> Series: ...
-    def limit(self, n: int = ...) -> Series: ...
-    def take_every(self, n: int) -> Series: ...
-    def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ...
-    def top_k(self, k: int = ...) -> Series: ...
-    def bottom_k(self, k: int = ...) -> Series: ...
-    def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ...
-    def arg_unique(self) -> Series: ...
-    def arg_min(self) -> int | None: ...
-    def arg_max(self) -> int | None: ...
-    @overload
-    def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ...
-    @overload
-    def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ...
-    def unique(self, *, maintain_order: bool = ...) -> Series: ...
-    def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ...
-    def null_count(self) -> int: ...
-    def has_validity(self) -> bool: ...
-    def is_empty(self) -> bool: ...
-    def is_sorted(self, *, descending: bool = ...) -> bool: ...
-    def not_(self) -> Series: ...
-    def is_null(self) -> Series: ...
-    def is_not_null(self) -> Series: ...
-    def is_finite(self) -> Series: ...
-    def is_infinite(self) -> Series: ...
-    def is_nan(self) -> Series: ...
-    def is_not_nan(self) -> Series: ...
-    def is_in(self, other: Series | Collection[Any]) -> Series: ...
-    def arg_true(self) -> Series: ...
-    def is_unique(self) -> Series: ...
-    def is_first_distinct(self) -> Series: ...
-    def is_last_distinct(self) -> Series: ...
-    def is_duplicated(self) -> Series: ...
-    def explode(self) -> Series: ...
-    def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ...
-    def len(self) -> int: ...
-    def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ...
-    def to_physical(self) -> Series: ...
-    def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ...
-    def rechunk(self, *, in_place: bool = ...) -> Self: ...
-    def reverse(self) -> Series: ...
-    def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ...
-    def is_numeric(self) -> bool: ...
-    def is_integer(self, signed: bool | None = ...) -> bool: ...
-    def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ...
-    def is_float(self) -> bool: ...
-    def is_boolean(self) -> bool: ...
-    def is_utf8(self) -> bool: ...
-    def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ...
-    def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ...
-    def to_arrow(self) -> pa.Array: ...
-    def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ...
-    def to_init_repr(self, n: int = ...) -> str: ...
-    def set(self, filter: Series, value: int | float | str) -> Series: ...
-    def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ...
-    def clear(self, n: int = ...) -> Series: ...
-    def clone(self) -> Self: ...
-    def fill_nan(self, value: int | float | Expr | None) -> Series: ...
-    def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ...
-    def floor(self) -> Series: ...
-    def ceil(self) -> Series: ...
-    def round(self, decimals: int = ...) -> Series: ...
-    def dot(self, other: Series | ArrayLike) -> float | None: ...
-    def mode(self) -> Series: ...
-    def sign(self) -> Series: ...
-    def sin(self) -> Series: ...
-    def cos(self) -> Series: ...
-    def tan(self) -> Series: ...
-    def arcsin(self) -> Series: ...
-    def arccos(self) -> Series: ...
-    def arctan(self) -> Series: ...
-    def arcsinh(self) -> Series: ...
-    def arccosh(self) -> Series: ...
-    def arctanh(self) -> Series: ...
-    def sinh(self) -> Series: ...
-    def cosh(self) -> Series: ...
-    def tanh(self) -> Series: ...
-    def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ...
-    def shift(self, periods: int = ...) -> Series: ...
-    def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ...
-    def zip_with(self, mask: Series, other: Series) -> Self: ...
-    def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ...
-    def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ...
-    def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ...
-    def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ...
-    def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ...
-    def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ...
-    def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ...
-    def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ...
-    def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ...
-    def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ...
-    def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ...
-    def peak_max(self) -> Self: ...
-    def peak_min(self) -> Self: ...
-    def n_unique(self) -> int: ...
-    def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ...
-    def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ...
-    def reinterpret(self, *, signed: bool = ...) -> Series: ...
-    def interpolate(self, method: InterpolationMethod = ...) -> Series: ...
-    def abs(self) -> Series: ...
-    def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ...
-    def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ...
-    def pct_change(self, n: int = ...) -> Series: ...
-    def skew(self, *, bias: bool = ...) -> float | None: ...
-    def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ...
-    def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: ...
-    def clip_min(self, lower_bound: int | float) -> Series: ...
-    def clip_max(self, upper_bound: int | float) -> Series: ...
-    def lower_bound(self) -> Self: ...
-    def upper_bound(self) -> Self: ...
-    def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ...
-    def reshape(self, dimensions: tuple[int, ...]) -> Series: ...
-    def shuffle(self, seed: int | None = ...) -> Series: ...
-    def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ...
-    def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ...
-    def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ...
-    def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ...
-    def set_sorted(self, *, descending: bool = ...) -> Self: ...
-    def new_from_index(self, index: int, length: int) -> Self: ...
-    def shrink_dtype(self) -> Series: ...
-    def get_chunks(self) -> list[Series]: ...
-    def implode(self) -> Self: ...
-    def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ...
-    def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ...
-    def is_first(self) -> Series: ...
-    def is_last(self) -> Series: ...
-    @property
-    def bin(self) -> BinaryNameSpace: ...
-    @property
-    def cat(self) -> CatNameSpace: ...
-    @property
-    def dt(self) -> DateTimeNameSpace: ...
-    @property
-    def list(self) -> ListNameSpace: ...
-    @property
-    def arr(self) -> ArrayNameSpace: ...
-    @property
-    def str(self) -> StringNameSpace: ...
-    @property
-    def struct(self) -> StructNameSpace: ...
-
-def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ...
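Note for reviewers: the regenerated 0.19.3 stub added below collapses the per-type ``@overload`` signatures that the old stub carried for operators such as ``__add__`` and ``__getitem__`` into single signatures with union return types. The following is a minimal, hypothetical sketch (not part of the generated stub or this diff; the variable names are illustrative only) of how that difference surfaces for a type-checked consumer.

    # Hypothetical snippet type-checked against the regenerated stub.
    # Old stub: the `__add__(self, other: Any) -> Self` overload made `s + 1`
    # infer as Series. New stub: `__add__(self, other: Any) -> Self | DataFrame | Expr`
    # yields a union, so narrowing may be needed before Series-only calls.
    import polars as pl

    s = pl.Series("a", [1, 2, 3])
    result = s + 1  # inferred as a Series | DataFrame | Expr union under the new stub

    if isinstance(result, pl.Series):  # narrow the union explicitly
        print(result.sum())
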
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/series/series.pyi
new file mode 100644
index 0000000..d038626
--- /dev/null
+++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.3/polars/series/series.pyi
@@ -0,0 +1,4705 @@
+#: version 0.19.3
+import np as np
+import pa as pa
+import pd as pd
+from builtins import PySeries
+from datetime import date, datetime, timedelta
+from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8
+from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code
+from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat
+from polars.series.array import ArrayNameSpace as ArrayNameSpace
+from polars.series.binary import BinaryNameSpace as BinaryNameSpace
+from polars.series.categorical import CatNameSpace as CatNameSpace
+from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace
+from polars.series.list import ListNameSpace as ListNameSpace
+from polars.series.string import StringNameSpace as StringNameSpace
+from polars.series.struct import StructNameSpace as StructNameSpace
+from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func
+from polars.slice import PolarsSlice as PolarsSlice
+from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries
+from polars.utils._wrap import wrap_df as wrap_df
+from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time
+from polars.utils.deprecation import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning
+from polars.utils.meta import get_index_type as get_index_type
+from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor
+from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence
+
+TYPE_CHECKING: bool
+INTEGER_DTYPES: frozenset
+_PYARROW_AVAILABLE: bool
+
+class Series:
+    _s: _ClassVar[None] = ...
+ _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the ``Series`` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series == other`` where `None` == None`. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series != other`` where `None` == None`. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, *args, **kwargs) -> Any: + ''' + Return the series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With an index, this is equivalent to ``s[index]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. 
+ + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self, *args, **kwargs) -> bool | None: + """ + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self, *args, **kwargs) -> bool | None: + """ + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. 
+ + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + category_label + Name of the category column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. 
This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, *args, **kwargs) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + category_label + Name of the category column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. + + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. 
+ + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴────────┘ + + Sort the output by count. + + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and ``append`` will change to always + behave like ``append_chunks=True`` (the previous default). For the + behavior of ``append_chunks=False``, use ``Series.extend``. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. 
In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. + + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from ``append``, which adds the chunks from ``other`` to the chunks of + this series, ``extend`` appends the data from ``other`` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``append`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer ``append`` over ``extend`` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single ``Series``. In the latter case, finish the sequence + of ``append`` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. 
+ + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is none, it means that there are no null values. + Use this to swiftly assert a Series does not have null values. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def not_(self) -> Series: + ''' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point ``nan`` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set ``zero_copy_only=True``. + + Alternatively, if you want a zero-copy view and know what you are doing, + use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. 
+ zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... 
) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + Series + The mutated series. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. 
+ + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: int | float, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Minimum value. 
+ upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: int | float) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, *args, **kwargs) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. 
+ + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/dataframe/frame deleted file mode 100644 index 245a8ef..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/dataframe/frame +++ /dev/null @@ -1,300 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase, TextIOWrapper -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions import col as col, lit as lit -from polars.interchange.dataframe import PolarsDataFrame as PolarsDataFrame -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as 
_xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnFormatDict as ColumnFormatDict, ColumnNameOrSelector as ColumnNameOrSelector, ColumnTotalsDefinition as ColumnTotalsDefinition, ColumnWidthsDefinition as ColumnWidthsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Label as Label, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SelectorType as SelectorType, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class 
DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., schema: None | SchemaDict = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... 
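The `_from_*` and `_read_*` classmethods typed above are private constructors; user code normally reaches them through the public `pl.DataFrame(...)` constructor and the `pl.read_*` readers. A minimal sketch, assuming polars is installed; the `data.csv` file name and the column names are made up for illustration:

import polars as pl

# public entry points; the private classmethods above are implementation details
df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
csv_df = pl.read_csv("data.csv")  # per the _read_csv docstring, pl.read_csv dispatches to it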
- @classmethod - def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ..., ignore_errors: bool = ...) -> Self: ... - def _replace(self, column: str, new_column: Series) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def flags(self) -> dict[str, dict[str, bool]]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, *, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state: list[Series]) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Series]: ... - def __reversed__(self) -> Iterator[Series]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... 
- @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
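The `write_json`, `write_ndjson`, and `write_csv` overloads above encode a useful convention: with `file=None` the serialized output is returned as a string, while passing a path or buffer writes to it and returns `None`. A small sketch, assuming polars is installed (the output file name is made up):

import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})

csv_text = df.write_csv()    # no file given -> returns the CSV document as a str
df.write_csv("out.csv")      # writing to a path -> returns None
json_text = df.write_json()  # same convention for JSON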
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: ColumnFormatDict | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., header_format: dict[str, Any] | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: ColumnWidthsDefinition | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | SelectorType | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ..., freeze_panes: str | tuple[int, int] | tuple[str, int, int] | tuple[int, int, int, int] | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - @overload - def write_ipc_stream(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, Any] | None = ...) -> None: ... - def write_database(self, table_name: str, connection: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: str | Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, max_items_per_column: int = ..., max_colname_length: int = ..., return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, max_items_per_column: int = ..., max_colname_length: int = ..., return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... 
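`sort` and `top_k` above accept column names or expressions via `IntoExpr`; `top_k` keeps only the k largest rows by the given key. A brief sketch, assuming polars is installed (column names are made up):

import polars as pl

df = pl.DataFrame({"name": ["a", "b", "c"], "score": [3, 1, 2]})

ranked = df.sort("score", descending=True)  # full sort, highest score first
best_two = df.top_k(2, by="score")          # only the two largest rows by "score"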
- def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool | None = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., label: Label = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... - def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, other: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: DataFrame) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> DataFrame: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... 
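The `pipe` signature above (`Callable[Concatenate[DataFrame, P], T]` with `P.args`/`P.kwargs`) is what lets a type checker verify the extra arguments passed to the piped function. A minimal sketch of the intended usage, assuming polars is installed; the helper function and column names are made up:

import polars as pl

def add_total(df: pl.DataFrame, col_a: str, col_b: str) -> pl.DataFrame:
    # arguments after the frame are forwarded by pipe
    return df.with_columns((pl.col(col_a) + pl.col(col_b)).alias("total"))

df = pl.DataFrame({"a": [1, 2], "b": [10, 20]})
out = df.pipe(add_total, "a", "b")  # result type inferred via T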
- def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... 
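`partition_by` above is overloaded on the `as_dict` literal: `False` yields a list of frames, `True` a dict keyed by group. A short sketch, assuming polars is installed (column names are made up):

import polars as pl

df = pl.DataFrame({"g": ["a", "a", "b"], "x": [1, 2, 3]})

parts = df.partition_by("g")                 # list[DataFrame], one frame per group
by_key = df.partition_by("g", as_dict=True)  # dict mapping group key -> DataFrame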
- def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def approx_n_unique(self) -> DataFrame: ... - def approx_unique(self) -> DataFrame: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | Series | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def corr(self, **kwargs: Any) -> DataFrame: ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... 
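The `row`/`rows`/`iter_rows` overloads above are likewise driven by the `named` literal: tuples by default, dicts when `named=True`. A small sketch, assuming polars is installed:

import polars as pl

df = pl.DataFrame({"foo": [1, 2], "bar": ["a", "b"]})

as_tuples = df.rows()           # list[tuple[Any, ...]]
as_dicts = df.rows(named=True)  # list[dict[str, Any]]
for record in df.iter_rows(named=True):
    print(record["foo"], record["bar"])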
- -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/dataframe/frame.pyi new file mode 100644 index 0000000..98d8abe --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/dataframe/frame.pyi @@ -0,0 +1,6625 @@ +#: version 0.19.5 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as 
parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. 
+ + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. 
+ rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) 
-> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to ``True`` will raise a ``NotImplementedError``. + allow_copy + Allow memory to be copied to perform the conversion. If set to ``False``, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars dataframe to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... 
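The `__dataframe__` method typed above implements the dataframe interchange protocol, so interchange-aware consumers can ingest a polars frame without a dedicated conversion path. A sketch under the assumption that a recent pandas (>= 1.5, which ships an interchange consumer) is installed alongside polars:

import pandas as pd
import polars as pl

df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})

# pandas consumes any object exposing __dataframe__
pdf = pd.api.interchange.from_dataframe(df)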
+ def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + ``structured`` is set to ``False`` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. 
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. 
Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname(s):str,}`` or ``{selector:str,}`` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in ``dtype_formats``. + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + header_format : dict + A ``{key:value,}`` dictionary of ``xlsxwriter`` format options to apply + to the table header row, such as ``{"bold":True, "font_color":"#702963"}``. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. 
+ + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` or ``{selector:int,}`` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. 
+ freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible ``xlsxwriter`` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... 
) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... 
"style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC record batch data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + If you pass ``partition_cols`` here, the dataset will be written + using ``pyarrow.parquet.write_to_dataset``. + The ``partition_cols`` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. 
+ + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, *args, **kwargs) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. 
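+ 
+         For example (assuming ``df`` contains a Categorical column named
+         ``"ham"``), such a column can be cast to a supported type before writing:
+ 
+         >>> df.with_columns(pl.col("ham").cast(pl.Utf8)).write_delta(
+         ...     "/path/to/delta-table/"
+         ... )  # doctest: +SKIP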
+ + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. 
+ All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. 
+ + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, *args, **kwargs) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. 
+ **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. 
+ + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``group_by_dynamic`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. 
Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.group_by_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is ``\'window\'``. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. 
deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". 
+ + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. 
+ + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. 
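In its simplest form this fills in the missing rows of a sorted temporal column; a minimal sketch with made-up data (a grouped example follows further below):

.. code-block:: python

    from datetime import date

    import polars as pl

    df = pl.DataFrame(
        {
            "day": [date(2021, 1, 1), date(2021, 1, 4)],
            "value": [1, 2],
        }
    ).set_sorted("day")
    # Insert the missing calendar days, then forward-fill the resulting nulls.
    out = df.upsample(time_column="day", every="1d").select(pl.all().forward_fill())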
+ + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. 
+ by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. 
+ + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. 
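A minimal sketch of pinning the output type instead of relying on inference (the frame here is illustrative):

.. code-block:: python

    import polars as pl

    df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]})
    # Returning one scalar per row yields a single-column frame; return_dtype
    # fixes its dtype instead of inferring it from the first rows.
    out = df.map_rows(lambda row: row[0] * 2 + row[1], return_dtype=pl.Int64)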
+ + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, *args, **kwargs) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from ``vstack`` which adds the chunks from ``other`` to the chunks of + this ``DataFrame``, ``extend`` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``vstack`` when you want to do a query after a single + append. 
For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer ``vstack`` over ``extend`` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single ``DataFrame``. In the latter case, finish the sequence of + ``vstack`` operations with a ``rechunk``. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. + + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... 
).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
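Before the multi-column example, note that the ``include_key`` flag described above can drop the key columns from each partition; a minimal sketch with illustrative data:

.. code-block:: python

    import polars as pl

    df = pl.DataFrame({"a": ["x", "x", "y"], "b": [1, 2, 3]})
    # Each resulting frame keeps only column "b"; the key column "a" is dropped.
    parts = df.partition_by("a", include_key=False)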
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self, *args, **kwargs) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using ``iter_rows`` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(<class \'list\'>, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(<class \'list\'>, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(<class \'list\'>, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy ``corrcoef``. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... 
).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def groupby(self, *args, **kwargs) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. 
+ + """ + def groupby_rolling(self, *args, **kwargs) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, *args, **kwargs) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is ``\'window\'``. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. 
If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, *args, **kwargs) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/expr/expr deleted file mode 100644 index 5888bae..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/expr/expr +++ /dev/null @@ -1,273 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, MapElementsStrategy as MapElementsStrategy, NullBehavior as NullBehavior, NumericLiteral as NumericLiteral, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, TemporalLiteral as TemporalLiteral, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import 
_timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int | bool) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int | bool) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr | int | bool) -> Self: ... - def __rxor__(self, other: Any) -> Self: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self, *, ignore_nulls: bool = ...) -> Self: ... - def all(self, *, ignore_nulls: bool = ...) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def cbrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def keep_name(self) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def is_not(self) -> Self: ... 
- def not_(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int | IntoExprColumn = ...) -> Self: ... - def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first_distinct(self) -> Self: ... - def is_last_distinct(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: ... 
- def qcut(self, quantiles: Sequence[float] | int, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> Self: ... - def rle(self) -> Self: ... - def rle_id(self) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | None | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... 
- def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: ... - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: ... - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | Expr | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... 
- def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def is_first(self) -> Self: ... - def is_last(self) -> Self: ... - def _register_plugin(self, lib: str, symbol: str, args: list[IntoExpr] | None = ..., *, is_elementwise: bool = ..., input_wildcard_expansion: bool = ..., auto_explode: bool = ..., cast_to_supertypes: bool = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
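The deleted stub above declares ``P = ParamSpec('P')`` and ``class Expr(Generic[P])`` so that ``Expr.pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T`` can tie the extra ``pipe`` arguments to the signature of the piped callable. As a minimal sketch of what that typing buys (not part of the diff; ``add_n`` is a made-up helper used only for illustration, and it assumes polars is installed):

    import polars as pl

    def add_n(expr: pl.Expr, n: int) -> pl.Expr:
        # Any callable that takes an Expr plus extra parameters can be piped.
        return expr + n

    df = pl.DataFrame({"a": [1, 2, 3]})

    # `pipe` forwards its extra arguments to `add_n`; with the ParamSpec-typed
    # stub, a mismatched call such as `pipe(add_n, n="x")` would be flagged by
    # a type checker, while this call type-checks and runs.
    print(df.select(pl.col("a").pipe(add_n, n=10)))

The regenerated ``expr.pyi`` added in the next hunk is the docstring-bearing replacement for this deleted file.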
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/expr/expr.pyi new file mode 100644 index 0000000..776198a --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/expr/expr.pyi @@ -0,0 +1,7965 @@ +#: version 0.19.5 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... 
+ def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self, *args, **kwargs) -> Self: + ''' + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self, *args, **kwargs) -> Self: + ''' + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. 
+ + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().map_alias(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. 
+ + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").keep_name()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... 
) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self, *args, **kwargs) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. 
+ + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. 
+ + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... 
) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... 
) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... 
) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... 
) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. 
+ + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... 
) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". 
+ + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for ``map`` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. 
+ + Notes + ----- + If you are looking to map a function over a window function or group_by context, + refer to func:`map_elements` instead. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + map_elements + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type ``Callable[[Any], Any]``. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type ``Callable[[Series], Any]``. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be ``pl.Unknown``. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). + pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using ``map_elements`` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using ``over`` is considered a GroupBy context + here, so ``map_elements`` can be used to map functions over window groups. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... 
pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using ``over`` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... ).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... 
"x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... 
) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... 
) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... 
) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. 
Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... 
).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, *args, **kwargs) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, *args, **kwargs) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, *args, **kwargs) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, *args, **kwargs) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... 
window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, *args, **kwargs) -> Self: + ''' + Compute a rolling standard deviation. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. 
Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, *args, **kwargs) -> Self: + ''' + Compute a rolling variance. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. 
Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... 
rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, *args, **kwargs) -> Self: + ''' + Compute a rolling median. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, *args, **kwargs) -> Self: + ''' + Compute a rolling quantile. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. 
+ + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. 
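+
+ The n-th discrete difference at position ``i`` is ``x[i] - x[i - n]``; the
+ first ``n`` results have no preceding value and are null (or are dropped
+ when ``null_behavior="drop"``).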
+ + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. 
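+
+ In symbols, the Pearson kurtosis is ``m_4 / m_2**2``, where ``m_i`` is the
+ i-th biased sample central moment; Fisher\'s definition subtracts 3 from
+ this value.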
+ + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). 
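+
+ A ``-1`` in one dimension lets it be inferred from the length; for the nine
+ values in the example below, ``reshape((-1, 3))`` is equivalent to
+ ``reshape((3, 3))``.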
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | Expr | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self, *args, **kwargs) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. 
+ + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. 
+ + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self, *args, **kwargs) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... 
.map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def map(self, *args, **kwargs) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, *args, **kwargs) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. 
+ strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, *args, **kwargs) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self, *args, **kwargs) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self, *args, **kwargs) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def _register_plugin(self, lib: str, symbol: str, args: list[IntoExpr] | None = ...) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by ``lib::symbol`` + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. + auto_explode + Explode the results in a group_by. + This is recommended for aggregation functions. + cast_to_supertypes + Cast the input datatypes to their supertype. + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
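[Illustrative usage sketch, not part of the generated stub or of this diff: a minimal example exercising the typed Expr API documented in the hunk above, assuming a polars 0.19.x install matching the stubbed version. With the generated expr.pyi on the type-checker path, calls such as `value_counts(sort=True)` and `map_dict(..., default=pl.first())` resolve against the signatures and docstrings shown above rather than dynamically registered attributes.]

import polars as pl

df = pl.DataFrame({"country_code": ["FR", None, "ES", "DE"]})

# Count unique values, largest group first (returns a Struct column, per the
# value_counts docstring above).
counted = df.select(pl.col("country_code").value_counts(sort=True))

# Remap via a dictionary, keeping the original value where no mapping exists
# (default=pl.first(), as described in the map_dict docstring above).
remapped = df.with_columns(
    pl.col("country_code")
    .map_dict({"FR": "France", "DE": "Germany"}, default=pl.first())
    .alias("remapped")
)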
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/lazyframe/frame deleted file mode 100644 index 01ead39..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/lazyframe/frame +++ /dev/null @@ -1,159 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat, subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, ColumnNameOrSelector as ColumnNameOrSelector, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Label as Label, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalize_filepath as normalize_filepath -from typing import Any, Awaitable, Callable, ClassVar, Collection, Concatenate, Iterable, Literal, Mapping, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] 
- def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., schema: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ..., hive_partitioning: bool = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, *, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... - def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... 
- @overload - def serialize(self, file: None = ...) -> str: ... - @overload - def serialize(self, file: IOBase | str | Path) -> None: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | Path | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ..., **kwargs: Any) -> DataFrame: ... - @overload - def collect_async(self, *, gevent: Literal[True], type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> _GeventDataFrameResult[DataFrame]: ... - @overload - def collect_async(self, *, gevent: Literal[False] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> Awaitable[DataFrame]: ... 
- def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_csv(self, path: str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def _set_sink_optimizations(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> PyLazyFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: IntoExpr) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool | None = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., label: Label = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... 
- def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: ... - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map_batches(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... 
- def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..26f7ba6 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/lazyframe/frame.pyi @@ -0,0 +1,4007 @@ +#: version 0.19.5 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter +from polars.utils.various import 
_in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def from_json(cls, *args, **kwargs) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to ``StringIO`` + and then use ``LazyFrame.deserialize``. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, *args, **kwargs) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to ``deserialize``. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. 
+ """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, *args, **kwargs) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... 
) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self, *args, **kwargs) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``True``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self, *args, **kwargs) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. 
+ *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. 
Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self, *args, **kwargs) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self, *args, **kwargs: Any) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. 
+ simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + **kwargs + For internal use. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + dataframe directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. 
+ + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). 
+ date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, *args, **kwargs) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that ``fetch`` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if ``n_rows`` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... 
"c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. 
+ + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: IntoExpr) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... 
is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_group_by`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... 
+ * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.group_by_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. 
+ * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... 
"time", every="1h", include_boundaries=True, closed="right" + ... ).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. 
+ allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. 
+ + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. 
+ + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... 
) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. 
The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self, *args, **kwargs) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... 
"a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... 
) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. 
+ + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def groupby(self, *args, **kwargs) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, *args, **kwargs) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, *args, **kwargs) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is ``\'window\'``. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, *args, **kwargs) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
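The `groupby`, `groupby_rolling`, `groupby_dynamic`, and `map` entries above are kept in the stub only as deprecated aliases; their docstrings state they were renamed in 0.19.0, so any new code should call the renamed methods. A minimal sketch of the preferred spelling, assuming a local polars >= 0.19 install; the frame contents below are made up purely for illustration and are not part of this diff:

import polars as pl

lf = pl.LazyFrame({"a": [1, 1, 2], "b": [3, 4, 5]})

# Preferred spelling since 0.19.0; `groupby(...)` would still type-check against
# the stub above but emits a deprecation warning at runtime.
out = lf.group_by("a", maintain_order=True).agg(pl.col("b").sum()).collect()
print(out)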
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/series/series deleted file mode 100644 index 5c1f0fe..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/series/series +++ /dev/null @@ -1,366 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, NullBehavior as NullBehavior, NumericLiteral as NumericLiteral, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TemporalLiteral as TemporalLiteral -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.deprecation 
import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, Literal, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> tuple[int, int, int]: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Series: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Series: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Series: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Series: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Series: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Series: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Series: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... 
- def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - @overload - def __floordiv__(self, other: Expr) -> Expr: ... - @overload - def __floordiv__(self, other: Any) -> Series: ... - def __invert__(self) -> Series: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def __column_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _repr_html_(self) -> str: ... - def item(self, index: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def cbrt(self) -> Series: ... - @overload - def any(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def any(self, *, ignore_nulls: bool) -> bool | None: ... - @overload - def all(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def all(self, *, ignore_nulls: bool) -> bool | None: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... 
- def pow(self, exponent: int | float | None | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: bool) -> Series | DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: bool) -> Series | DataFrame: ... - def rle(self) -> Series: ... - def rle_id(self) -> Series: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool | None = ...) -> Self: ... - def extend(self, other: Series) -> Self: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... 
- def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int | IntoExprColumn = ...) -> Series: ... - def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def not_(self) -> Series: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first_distinct(self) -> Series: ... - def is_last_distinct(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... 
- def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... 
- def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: ... - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: ... - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def is_first(self) -> Series: ... - def is_last(self) -> Series: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... 
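The replacement stub added below begins with a `#: version 0.19.5` marker line. A minimal sketch of how a consumer could read that version tag back out of a stub file; the helper name and error handling are illustrative assumptions, not part of this diff:

from __future__ import annotations

from pathlib import Path

def read_stub_version(stub_path: Path) -> str | None:
    # Generated .pyi files start with a line of the form "#: version <x.y.z>".
    first_line = stub_path.read_text().partition("\n")[0]
    prefix = "#: version "
    return first_line[len(prefix):].strip() if first_line.startswith(prefix) else None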
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/series/series.pyi new file mode 100644 index 0000000..cf6cd70 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.5/polars/series/series.pyi @@ -0,0 +1,4713 @@ +#: version 0.19.5 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.deprecation import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... 
+ _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the ``Series`` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series == other`` where `None` == None`. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series != other`` where `None` == None`. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, *args, **kwargs) -> Any: + ''' + Return the series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With an index, this is equivalent to ``s[index]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. 
+ + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self, *args, **kwargs) -> bool | None: + """ + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self, *args, **kwargs) -> bool | None: + """ + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. 
+ + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + category_label + Name of the category column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. 
This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, *args, **kwargs) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + category_label + Name of the category column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. + + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. 
+ 
+ Returns 
+ ------- 
+ DataFrame 
+ Mapping of unique values to their count. 
+ 
+ Examples 
+ -------- 
+ >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) 
+ >>> s.value_counts() # doctest: +IGNORE_RESULT 
+ shape: (3, 2) 
+ ┌───────┬────────┐ 
+ │ color ┆ counts │ 
+ │ --- ┆ --- │ 
+ │ str ┆ u32 │ 
+ ╞═══════╪════════╡ 
+ │ red ┆ 2 │ 
+ │ green ┆ 1 │ 
+ │ blue ┆ 3 │ 
+ └───────┴────────┘ 
+ 
+ Sort the output by count. 
+ 
+ >>> s.value_counts(sort=True) 
+ shape: (3, 2) 
+ ┌───────┬────────┐ 
+ │ color ┆ counts │ 
+ │ --- ┆ --- │ 
+ │ str ┆ u32 │ 
+ ╞═══════╪════════╡ 
+ │ blue ┆ 3 │ 
+ │ red ┆ 2 │ 
+ │ green ┆ 1 │ 
+ └───────┴────────┘ 
+ 
+ ''' 
+ def unique_counts(self) -> Series: 
+ ''' 
+ Return a count of the unique values in the order of appearance. 
+ 
+ Examples 
+ -------- 
+ >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) 
+ >>> s.unique_counts() 
+ shape: (3,) 
+ Series: \'id\' [u32] 
+ [ 
+ 1 
+ 2 
+ 3 
+ ] 
+ 
+ ''' 
+ def entropy(self, base: float = ...) -> float | None: 
+ """ 
+ Computes the entropy. 
+ 
+ Uses the formula ``-sum(pk * log(pk))`` where ``pk`` are discrete probabilities. 
+ 
+ Parameters 
+ ---------- 
+ base 
+ Given base, defaults to `e` 
+ normalize 
+ Normalize pk if it doesn't sum to 1. 
+ 
+ Examples 
+ -------- 
+ >>> a = pl.Series([0.99, 0.005, 0.005]) 
+ >>> a.entropy(normalize=True) 
+ 0.06293300616044681 
+ >>> b = pl.Series([0.65, 0.10, 0.25]) 
+ >>> b.entropy(normalize=True) 
+ 0.8568409950394724 
+ 
+ """ 
+ def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: 
+ ''' 
+ Run an expression over a sliding window that increases by `1` slot every iteration. 
+ 
+ Parameters 
+ ---------- 
+ expr 
+ Expression to evaluate 
+ min_periods 
+ Number of valid values there should be in the window before the expression 
+ is evaluated. valid values = `length - null_count` 
+ parallel 
+ Run in parallel. Don\'t do this in a group by or another operation that 
+ already has much parallelization. 
+ 
+ Warnings 
+ -------- 
+ This functionality is experimental and may change without it being considered a 
+ breaking change. 
+ 
+ This can be really slow as it can have `O(n^2)` complexity. Don\'t use this 
+ for operations that visit all elements. 
+ 
+ Examples 
+ -------- 
+ >>> s = pl.Series("values", [1, 2, 3, 4, 5]) 
+ >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) 
+ shape: (5,) 
+ Series: \'values\' [f64] 
+ [ 
+ 0.0 
+ -3.0 
+ -8.0 
+ -15.0 
+ -24.0 
+ ] 
+ 
+ ''' 
+ def alias(self, name: str) -> Series: 
+ ''' 
+ Rename the series. 
+ 
+ Parameters 
+ ---------- 
+ name 
+ The new name. 
+ 
+ Examples 
+ -------- 
+ >>> s = pl.Series("a", [1, 2, 3]) 
+ >>> s.alias("b") 
+ shape: (3,) 
+ Series: \'b\' [i64] 
+ [ 
+ 1 
+ 2 
+ 3 
+ ] 
+ 
+ ''' 
+ def rename(self, name: str) -> Series: 
+ ''' 
+ Rename this Series. 
+ 
+ Alias for :func:`Series.alias`. 
+ 
+ Parameters 
+ ---------- 
+ name 
+ New name. 
+ 
+ Examples 
+ -------- 
+ >>> s = pl.Series("a", [1, 2, 3]) 
+ >>> s.rename("b") 
+ shape: (3,) 
+ Series: \'b\' [i64] 
+ [ 
+ 1 
+ 2 
+ 3 
+ ] 
+ 
+ ''' 
+ def chunk_lengths(self) -> list[int]: 
+ ''' 
+ Get the length of each individual chunk. 
+ 
+ Examples 
+ -------- 
+ >>> s = pl.Series("a", [1, 2, 3]) 
+ >>> s2 = pl.Series("a", [4, 5, 6]) 
+ 
+ Concatenate Series with rechunk = True 
+ 
+ >>> pl.concat([s, s2]).chunk_lengths() 
+ [6] 
+ 
+ Concatenate Series with rechunk = False 
+ 
+ >>> pl.concat([s, s2], rechunk=False).chunk_lengths() 
+ [3, 3] 
+ 
+ ''' 
+ def n_chunks(self) -> int: 
+ ''' 
+ Get the number of chunks that this Series contains. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and ``append`` will change to always + behave like ``append_chunks=True`` (the previous default). For the + behavior of ``append_chunks=False``, use ``Series.extend``. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. 
In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. + + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from ``append``, which adds the chunks from ``other`` to the chunks of + this series, ``extend`` appends the data from ``other`` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``append`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer ``append`` over ``extend`` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single ``Series``. In the latter case, finish the sequence + of ``append`` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. 
+ + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no ``null`` values. + + Notes + ----- + While the *absence* of a validity bitmask guarantees that a Series does not + have ``null`` values, the converse is not true, eg: the *presence* of a + bitmask does not mean that there are null values, as every value of the + bitmask could be ``false``. + + To confirm that a column has ``null`` values use :func:`null_count`. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def not_(self) -> Series: + ''' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. 
+ + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point ``nan`` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set ``zero_copy_only=True``. + + Alternatively, if you want a zero-copy view and know what you are doing, + use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. 
+ zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... 
) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + Series + The mutated series. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. 
+ + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. 
See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, *args, **kwargs) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. 
+ + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/dataframe/frame deleted file mode 100644 index 245a8ef..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/dataframe/frame +++ /dev/null @@ -1,300 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase, TextIOWrapper -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions import col as col, lit as lit -from polars.interchange.dataframe import PolarsDataFrame as PolarsDataFrame -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as 
_xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnFormatDict as ColumnFormatDict, ColumnNameOrSelector as ColumnNameOrSelector, ColumnTotalsDefinition as ColumnTotalsDefinition, ColumnWidthsDefinition as ColumnWidthsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Label as Label, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SelectorType as SelectorType, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class 
DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., schema: None | SchemaDict = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... 
- @classmethod - def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ..., ignore_errors: bool = ...) -> Self: ... - def _replace(self, column: str, new_column: Series) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def flags(self) -> dict[str, dict[str, bool]]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, *, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state: list[Series]) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Series]: ... - def __reversed__(self) -> Iterator[Series]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... 
- @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: ColumnFormatDict | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., header_format: dict[str, Any] | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: ColumnWidthsDefinition | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | SelectorType | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ..., freeze_panes: str | tuple[int, int] | tuple[str, int, int] | tuple[int, int, int, int] | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - @overload - def write_ipc_stream(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, Any] | None = ...) -> None: ... - def write_database(self, table_name: str, connection: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: str | Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, max_items_per_column: int = ..., max_colname_length: int = ..., return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, max_items_per_column: int = ..., max_colname_length: int = ..., return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... 
- def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool | None = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., label: Label = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... - def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, other: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: DataFrame) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> DataFrame: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... 
- def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... 
- def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def approx_n_unique(self) -> DataFrame: ... - def approx_unique(self) -> DataFrame: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | Series | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def corr(self, **kwargs: Any) -> DataFrame: ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... 
- -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/dataframe/frame.pyi new file mode 100644 index 0000000..12412f9 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/dataframe/frame.pyi @@ -0,0 +1,6625 @@ +#: version 0.19.6 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as 
parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. 
+ + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. 
+ rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) 
-> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to ``True`` will raise a ``NotImplementedError``. + allow_copy + Allow memory to be copied to perform the conversion. If set to ``False``, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars dataframe to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (<DtypeKind.FLOAT: 2>, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ...
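The comparison and arithmetic dunders above are annotated to return DataFrame (not bool), which is what lets element-wise expressions type-check against this stub. A short sketch, illustrative only and assuming polars 0.19.x:

    import polars as pl

    df = pl.DataFrame({"x": [1, 2, 3]})
    mask = df == 2    # __eq__ per the stub: a DataFrame of booleans
    halved = df / 2   # __truediv__ per the stub: a DataFrame
    assert isinstance(mask, pl.DataFrame)
    assert isinstance(halved, pl.DataFrame)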
+ def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + ``structured`` is set to ``False`` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'<f4\'), (\'ham\', \'<U1\')]) + + ...optionally zero-copying as a record array view: + + >>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'<f4\'), (\'ham\', \'<U1\')]) + + ''' + def to_pandas(self, *args, **kwargs) -> pd.DataFrame: + ''' + Cast to a pandas DataFrame.
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + <class \'pandas.core.frame.DataFrame\'> + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 <NA> + 1 2 <NA> b + 2 <NA> 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ...
"foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method.
Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname(s):str,}`` or ``{selector:str,}`` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in ``dtype_formats``. + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + header_format : dict + A ``{key:value,}`` dictionary of ``xlsxwriter`` format options to apply + to the table header row, such as ``{"bold":True, "font_color":"#702963"}``. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. 
+ + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` or ``{selector:int,}`` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. 
+ freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible ``xlsxwriter`` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... 
) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... 
"style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC record batch data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + If you pass ``partition_cols`` here, the dataset will be written + using ``pyarrow.parquet.write_to_dataset``. + The ``partition_cols`` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. 
+ + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, *args, **kwargs) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. 
+ + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. 
+ All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. 
+ + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, *args, **kwargs) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. 
+ **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. 
+ + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``group_by_dynamic`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. 
Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.group_by_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is ``\'window\'``. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. 
deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". 
+ + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. 
+ + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. 
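As a minimal sketch ahead of the full parameter description below (dates and column names are illustrative): upsampling inserts the missing regular timestamps and leaves the other columns null, so a fill step usually follows.

import polars as pl
from datetime import datetime

df = pl.DataFrame(
    {
        "time": [datetime(2021, 2, 1), datetime(2021, 5, 1)],
        "values": [0, 2],
    }
).set_sorted("time")

# One row per month between the first and last timestamp; the newly inserted
# rows are null-filled and then forward-filled here.
out = df.upsample(time_column="time", every="1mo").select(pl.all().forward_fill())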
+ + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. 
+ by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. 
+ + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. 
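A minimal sketch of supplying `return_dtype` explicitly so the output type does not depend on inference from the first rows (the lambda is illustrative):

import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]})

# Each row arrives as a tuple; the scalar results form a single output column
# whose dtype is stated up front rather than inferred.
out = df.map_rows(lambda row: row[0] * 2 + row[1], return_dtype=pl.Int64)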
+ + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, *args, **kwargs) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from ``vstack`` which adds the chunks from ``other`` to the chunks of + this ``DataFrame``, ``extend`` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``vstack`` when you want to do a query after a single + append. 
For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer ``vstack`` over ``extend`` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single ``DataFrame``. In the latter case, finish the sequence of + ``vstack`` operations with a ``rechunk``. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. + + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
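A small sketch (illustrative data) underlining that the return value is a `Series`, so Series methods chain directly on it:

import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})

s = df.get_column("foo")
assert s.name == "foo"
assert s.sum() == 6  # Series methods apply directly to the result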
+ + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... 
).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
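+ + Returns + ------- + int + The number of unique rows, or the number of unique row-subsets.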
+ + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self, *args, **kwargs) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using ``iter_rows`` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. 
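+ + Returns + ------- + bool + ``True`` if the dataframe contains no rows.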
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy ``corrcoef``. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... 
).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def groupby(self, *args, **kwargs) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. 
+ + """ + def groupby_rolling(self, *args, **kwargs) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, *args, **kwargs) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is ``\'window\'``. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. 
If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, *args, **kwargs) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/expr/expr deleted file mode 100644 index 5888bae..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/expr/expr +++ /dev/null @@ -1,273 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, MapElementsStrategy as MapElementsStrategy, NullBehavior as NullBehavior, NumericLiteral as NumericLiteral, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, TemporalLiteral as TemporalLiteral, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import 
_timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int | bool) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int | bool) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr | int | bool) -> Self: ... - def __rxor__(self, other: Any) -> Self: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self, *, ignore_nulls: bool = ...) -> Self: ... - def all(self, *, ignore_nulls: bool = ...) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def cbrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... - def keep_name(self) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def is_not(self) -> Self: ... 
- def not_(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int | IntoExprColumn = ...) -> Self: ... - def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def is_unique(self) -> Self: ... - def is_first_distinct(self) -> Self: ... - def is_last_distinct(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: ... 
- def qcut(self, quantiles: Sequence[float] | int, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> Self: ... - def rle(self) -> Self: ... - def rle_id(self) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | None | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... 
- def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: ... - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: ... - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | Expr | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... 
- def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def is_first(self) -> Self: ... - def is_last(self) -> Self: ... - def _register_plugin(self, lib: str, symbol: str, args: list[IntoExpr] | None = ..., *, is_elementwise: bool = ..., input_wildcard_expansion: bool = ..., auto_explode: bool = ..., cast_to_supertypes: bool = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/expr/expr.pyi new file mode 100644 index 0000000..968d0cf --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/expr/expr.pyi @@ -0,0 +1,7965 @@ +#: version 0.19.6 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... 
+ def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self, *args, **kwargs) -> Self: + ''' + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self, *args, **kwargs) -> Self: + ''' + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. 
+ + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().map_alias(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. 
+ + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").keep_name()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... 
) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self, *args, **kwargs) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. 
+ + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. 
+ + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... 
) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... 
) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... 
) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... 
) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift(1)) + shape: (4, 1) + ┌──────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.select(pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... 
) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... 
) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. 
+ + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... 
) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". 
+ + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for ``map`` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. 
+ + Notes + ----- + If you are looking to map a function over a window function or group_by context, + refer to func:`map_elements` instead. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + map_elements + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type ``Callable[[Any], Any]``. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type ``Callable[[Series], Any]``. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be ``pl.Unknown``. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). + pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using ``map_elements`` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using ``over`` is considered a GroupBy context + here, so ``map_elements`` can be used to map functions over window groups. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... 
pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using ``over`` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... ).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... 
"x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... 
) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... 
) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... 
) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. 
Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... 
).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, *args, **kwargs) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, *args, **kwargs) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, *args, **kwargs) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, *args, **kwargs) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... 
window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, *args, **kwargs) -> Self: + ''' + Compute a rolling standard deviation. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. 
Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, *args, **kwargs) -> Self: + ''' + Compute a rolling variance. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. 
Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... 
rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, *args, **kwargs) -> Self: + ''' + Compute a rolling median. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, *args, **kwargs) -> Self: + ''' + Compute a rolling quantile. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. 
+ + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. 
+ + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. 
+ + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | Expr | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self, *args, **kwargs) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. 
+ + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. 
+ + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self, *args, **kwargs) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... 
.map_dict(country_code_dict, default="unknown") + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def map(self, *args, **kwargs) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, *args, **kwargs) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. 
+ strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, *args, **kwargs) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self, *args, **kwargs) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self, *args, **kwargs) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def _register_plugin(self, lib: str, symbol: str, args: list[IntoExpr] | None = ...) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by ``lib::symbol`` + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. + auto_explode + Explode the results in a group_by. + This is recommended for aggregation functions. + cast_to_supertypes + Cast the input datatypes to their supertype. + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
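The deprecation notes in the stub above point each legacy Expr method at its replacement (map -> map_batches, apply -> map_elements, rolling_apply -> rolling_map, is_first -> is_first_distinct, is_last -> is_last_distinct) but give no usage. A minimal doctest-style sketch of the renamed APIs, assuming polars >= 0.19; this is illustrative only and is not part of the generated stub:

    >>> import polars as pl
    >>> df = pl.DataFrame({"a": [1, 1, 2]})
    >>> df.select(pl.col("a").is_first_distinct())                 # replaces Expr.is_first
    >>> df.select(pl.col("a").map_batches(lambda s: s * 2))        # replaces Expr.map
    >>> df.select(pl.col("a").map_elements(lambda x: x * 2))       # replaces Expr.apply
    >>> df.select(pl.col("a").rolling_map(lambda s: s.sum(), window_size=2))  # replaces Expr.rolling_apply
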
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/lazyframe/frame deleted file mode 100644 index b43cf3b..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/lazyframe/frame +++ /dev/null @@ -1,159 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat, subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, ColumnNameOrSelector as ColumnNameOrSelector, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Label as Label, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalize_filepath as normalize_filepath -from typing import Any, Awaitable, Callable, ClassVar, Collection, Concatenate, Iterable, Literal, Mapping, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] 
- def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., schema: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ..., hive_partitioning: bool = ..., retries: int = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, *, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... - def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... 
- @overload - def serialize(self, file: None = ...) -> str: ... - @overload - def serialize(self, file: IOBase | str | Path) -> None: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | Path | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ..., **kwargs: Any) -> DataFrame: ... - @overload - def collect_async(self, *, gevent: Literal[True], type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> _GeventDataFrameResult[DataFrame]: ... - @overload - def collect_async(self, *, gevent: Literal[False] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> Awaitable[DataFrame]: ... 
- def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_csv(self, path: str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def _set_sink_optimizations(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> PyLazyFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: IntoExpr) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool | None = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., label: Label = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... 
- def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: ... - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map_batches(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... 
- def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..43e4cea --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/lazyframe/frame.pyi @@ -0,0 +1,4007 @@ +#: version 0.19.6 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter +from polars.utils.various import 
_in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def from_json(cls, *args, **kwargs) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to ``StringIO`` + and then use ``LazyFrame.deserialize``. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, *args, **kwargs) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to ``deserialize``. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. 
+ """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, *args, **kwargs) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... 
) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self, *args, **kwargs) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``True``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self, *args, **kwargs) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. 
+ *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. 
Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self, *args, **kwargs) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self, *args, **kwargs: Any) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. 
+ simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + **kwargs + For internal use. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + dataframe directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. 
+ + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). 
+ date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, *args, **kwargs) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that ``fetch`` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if ``n_rows`` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... 
"c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. 
+ + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: IntoExpr) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... 
is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_group_by`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... 
+ * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.group_by_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. 
+ * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... 
"time", every="1h", include_boundaries=True, closed="right" + ... ).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. 
+ allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. 
+ + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. 
+ + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... 
) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. 
The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self, *args, **kwargs) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... 
"a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... 
) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. 
+ + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\'} + \'left\' will keep the left table rows as is. + \'inner\' will remove rows that are not found in other + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def groupby(self, *args, **kwargs) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, *args, **kwargs) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, *args, **kwargs) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. 
+ every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is ``\'window\'``. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars cannot check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output. + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + \'\'\' + def map(self, *args, **kwargs) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda / function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to run with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is executed on the full + dataset. + + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ...
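An illustrative sketch, not part of the generated stub or the patch itself: the LazyFrame hunk that ends here documents a number of lazy-API signatures (group_by_dynamic, join_asof, with_columns, fill_null, set_sorted). A minimal script like the one below, which only uses methods and parameters shown in those docstrings, gives a type checker pointed at the generated frame.pyi some concrete call sites to verify; the toy data and variable names are invented for illustration.

from datetime import datetime, timedelta

import polars as pl

# Half-hourly samples; set_sorted tells polars the key is already ordered,
# which both group_by_dynamic and join_asof rely on.
times = [datetime(2021, 12, 16) + timedelta(minutes=30 * i) for i in range(7)]
lf = pl.LazyFrame({"time": times, "n": list(range(7))}).set_sorted("time")

# Dynamic (time-windowed) grouping, as documented in the stub above.
hourly = (
    lf.group_by_dynamic("time", every="1h", closed="left")
    .agg(pl.col("n").mean())
    .collect()
)

# Asof join with a temporal tolerance, then column addition and null filling.
events = pl.LazyFrame({"time": [datetime(2021, 12, 16, 1)], "flag": [True]}).set_sorted("time")
out = (
    lf.join_asof(events, on="time", strategy="backward", tolerance="1h")
    .with_columns(double_n=pl.col("n") * 2)
    .fill_null(strategy="forward")
    .collect()
)
print(hourly, out)

Exercising a handful of documented signatures this way can serve as a cheap smoke test that a freshly generated stub stays importable and coherent from one polars version to the next.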
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/series/series deleted file mode 100644 index 5c1f0fe..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/series/series +++ /dev/null @@ -1,366 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, NullBehavior as NullBehavior, NumericLiteral as NumericLiteral, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TemporalLiteral as TemporalLiteral -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.deprecation 
import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, Literal, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> tuple[int, int, int]: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Series: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Series: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Series: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Series: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Series: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Series: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Series: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... - @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... 
- def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - @overload - def __floordiv__(self, other: Expr) -> Expr: ... - @overload - def __floordiv__(self, other: Any) -> Series: ... - def __invert__(self) -> Series: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def __column_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _repr_html_(self) -> str: ... - def item(self, index: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def cbrt(self) -> Series: ... - @overload - def any(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def any(self, *, ignore_nulls: bool) -> bool | None: ... - @overload - def all(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def all(self, *, ignore_nulls: bool) -> bool | None: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... - def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... 
- def pow(self, exponent: int | float | None | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: bool) -> Series | DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: bool) -> Series | DataFrame: ... - def rle(self) -> Series: ... - def rle_id(self) -> Series: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool | None = ...) -> Self: ... - def extend(self, other: Series) -> Self: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... - def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... 
- def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int | IntoExprColumn = ...) -> Series: ... - def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def not_(self) -> Series: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first_distinct(self) -> Series: ... - def is_last_distinct(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... 
- def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... 
- def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: ... - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: ... - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def is_first(self) -> Series: ... - def is_last(self) -> Series: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/series/series.pyi new file mode 100644 index 0000000..d0e92cf --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.6/polars/series/series.pyi @@ -0,0 +1,4713 @@ +#: version 0.19.6 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.deprecation import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... 
+ _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the ``Series`` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series == other`` where `None` == None`. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series != other`` where `None` == None`. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, *args, **kwargs) -> Any: + ''' + Return the series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With an index, this is equivalent to ``s[index]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. 
+ + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self, *args, **kwargs) -> bool | None: + """ + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self, *args, **kwargs) -> bool | None: + """ + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is ``None``. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. 
+ + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + category_label + Name of the category column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. 
This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, *args, **kwargs) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + category_label + Name of the category column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. + + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. 
+ + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴────────┘ + + Sort the output by count. + + shape: (3, 2) + ┌───────┬────────┐ + │ color ┆ counts │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪════════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴────────┘ + + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and ``append`` will change to always + behave like ``append_chunks=True`` (the previous default). For the + behavior of ``append_chunks=False``, use ``Series.extend``. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. 
In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. + + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from ``append``, which adds the chunks from ``other`` to the chunks of + this series, ``extend`` appends the data from ``other`` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``append`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer ``append`` over ``extend`` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single ``Series``. In the latter case, finish the sequence + of ``append`` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. 
+ + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + head + + """ + def take_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.take([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no ``null`` values. + + Notes + ----- + While the *absence* of a validity bitmask guarantees that a Series does not + have ``null`` values, the converse is not true, eg: the *presence* of a + bitmask does not mean that there are null values, as every value of the + bitmask could be ``false``. + + To confirm that a column has ``null`` values use :func:`null_count`. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def not_(self) -> Series: + ''' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. 
+ + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point ``nan`` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set ``zero_copy_only=True``. + + Alternatively, if you want a zero-copy view and know what you are doing, + use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. 
+ zero_copy_only
+ If True, an exception will be raised if the conversion to a numpy
+ array would require copying the underlying data (e.g. in presence
+ of nulls, or for non-primitive types).
+ writable
+ For numpy arrays created with zero copy (view on the Arrow data),
+ the resulting array is not writable (Arrow data is immutable).
+ By setting this to True, a copy of the array is made to ensure
+ it is writable.
+ use_pyarrow
+ Use `pyarrow.Array.to_numpy
+ `_
+
+ for the conversion to numpy.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> arr = s.to_numpy()
+ >>> arr # doctest: +IGNORE_RESULT
+ array([1, 2, 3], dtype=int64)
+ >>> type(arr)
+ <class 'numpy.ndarray'>
+
+ '''
+ def to_arrow(self) -> pa.Array:
+ '''
+ Get the underlying Arrow Array.
+
+ If the Series contains only a single chunk this operation is zero copy.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s = s.to_arrow()
+ >>> s # doctest: +ELLIPSIS
+ <pyarrow.lib.Int64Array object at ...>
+ [
+ 1,
+ 2,
+ 3
+ ]
+
+ '''
+ def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]:
+ '''
+ Convert this Series to a pandas Series.
+
+ This requires that :mod:`pandas` and :mod:`pyarrow` are installed.
+ This operation clones data, unless `use_pyarrow_extension_array=True`.
+
+ Parameters
+ ----------
+ use_pyarrow_extension_array
+ Use PyArrow backed-extension array instead of numpy array for pandas
+ Series. This allows zero copy operations and preservation of null
+ values.
+ Further operations on this pandas Series, might trigger conversion
+ to NumPy arrays if that operation is not supported by pyarrow compute
+ functions.
+ kwargs
+ Arguments will be sent to :meth:`pyarrow.Table.to_pandas`.
+
+ Examples
+ --------
+ >>> s1 = pl.Series("a", [1, 2, 3])
+ >>> s1.to_pandas()
+ 0 1
+ 1 2
+ 2 3
+ Name: a, dtype: int64
+ >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP
+ 0 1
+ 1 2
+ 2 3
+ Name: a, dtype: int64[pyarrow]
+ >>> s2 = pl.Series("b", [1, 2, None, 4])
+ >>> s2.to_pandas()
+ 0 1.0
+ 1 2.0
+ 2 NaN
+ 3 4.0
+ Name: b, dtype: float64
+ >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP
+ 0 1
+ 1 2
+ 2 <NA>
+ 3 4
+ Name: b, dtype: int64[pyarrow]
+
+ '''
+ def to_init_repr(self, n: int = ...) -> str:
+ '''
+ Convert Series to instantiatable string representation.
+
+ Parameters
+ ----------
+ n
+ Only use first n elements.
+
+ See Also
+ --------
+ polars.Series.to_init_repr
+ polars.from_repr
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16)
+ >>> print(s.to_init_repr())
+ pl.Series("a", [1, 2, None, 4], dtype=pl.Int16)
+ >>> s_from_str_repr = eval(s.to_init_repr())
+ >>> s_from_str_repr
+ shape: (4,)
+ Series: \'a\' [i16]
+ [
+ 1
+ 2
+ null
+ 4
+ ]
+
+ '''
+ def set(self, filter: Series, value: int | float | str) -> Series:
+ '''
+ Set masked values.
+
+ Parameters
+ ----------
+ filter
+ Boolean mask.
+ value
+ Value with which to replace the masked values.
+
+ Notes
+ -----
+ Use of this function is frequently an anti-pattern, as it can
+ block optimisation (predicate pushdown, etc). Consider using
+ `pl.when(predicate).then(value).otherwise(self)` instead.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.set(s == 2, 10)
+ shape: (3,)
+ Series: \'a\' [i64]
+ [
+ 1
+ 10
+ 3
+ ]
+
+ It is better to implement this as follows:
+
+ >>> s.to_frame().select(
+ ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a"))
+ ... 
) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + Series + The mutated series. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. 
+ + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. 
See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, *args, **kwargs) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. 
+ + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/dataframe/frame deleted file mode 100644 index 66a6414..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/dataframe/frame +++ /dev/null @@ -1,300 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import deltalake - -from datetime import timedelta -from io import BytesIO, IOBase, TextIOWrapper -from pathlib import Path -from polars import Expr as Expr, LazyFrame as LazyFrame, Series as Series -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes import Boolean as Boolean, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, NUMERIC_DTYPES as NUMERIC_DTYPES, N_INFER_DEFAULT as N_INFER_DEFAULT, Object as Object, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions import col as col, lit as lit -from polars.interchange.dataframe import PolarsDataFrame as PolarsDataFrame -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as 
_xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.polars import PyDataFrame as PyDataFrame -from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, AvroCompression as AvroCompression, ClosedInterval as ClosedInterval, ColumnFormatDict as ColumnFormatDict, ColumnNameOrSelector as ColumnNameOrSelector, ColumnTotalsDefinition as ColumnTotalsDefinition, ColumnWidthsDefinition as ColumnWidthsDefinition, ComparisonOperator as ComparisonOperator, ConditionalFormatDict as ConditionalFormatDict, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, DbWriteEngine as DbWriteEngine, DbWriteMode as DbWriteMode, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IndexOrder as IndexOrder, IntoExpr as IntoExpr, IpcCompression as IpcCompression, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Label as Label, NullStrategy as NullStrategy, OneOrMoreDataTypes as OneOrMoreDataTypes, Orientation as Orientation, ParallelStrategy as ParallelStrategy, ParquetCompression as ParquetCompression, PivotAgg as PivotAgg, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, RowTotalsDefinition as RowTotalsDefinition, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, SelectorType as SelectorType, SizeUnit as SizeUnit, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy, UnstackDirection as UnstackDirection -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from typing import Any, BinaryIO, Callable, ClassVar, Collection, Concatenate, Iterable, Iterator, Literal, Mapping, NoReturn, Sequence, TypeAlias, TypeVar, overload -from typing_extensions import Self -from xlsxwriter import Workbook as Workbook - -MultiRowSelector: TypeAlias -MultiColSelector: TypeAlias -T = TypeVar('T') -P = ParamSpec('P') - -class 
DataFrame(Generic[P]): - _accessors: ClassVar[set[str]] - _df: PyDataFrame - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: ... - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ...) -> Self: ... - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ...) -> Self: ... - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ...) -> Self: ... - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., rechunk: bool = ..., nan_to_null: bool = ..., include_index: bool = ...) -> Self: ... - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes, *, has_header: bool = ..., columns: Sequence[int] | Sequence[str] | None = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: None | SchemaDict | Sequence[PolarsDataType] = ..., schema: None | SchemaDict = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., try_parse_dates: bool = ..., n_threads: int | None = ..., infer_schema_length: int | None = ..., batch_size: int = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., sample_size: int = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> DataFrame: ... - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., parallel: ParallelStrategy = ..., row_count_name: str | None = ..., row_count_offset: int = ..., low_memory: bool = ..., use_statistics: bool = ..., rechunk: bool = ...) -> DataFrame: ... - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ...) -> Self: ... - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ..., memory_map: bool = ...) -> Self: ... 
- @classmethod - def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes, *, columns: Sequence[int] | Sequence[str] | None = ..., n_rows: int | None = ..., row_count_name: str | None = ..., row_count_offset: int = ..., rechunk: bool = ...) -> Self: ... - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes, *, schema: SchemaDefinition | None = ..., schema_overrides: SchemaDefinition | None = ..., ignore_errors: bool = ...) -> Self: ... - def _replace(self, column: str, new_column: Series) -> Self: ... - @property - def shape(self) -> tuple[int, int]: ... - @property - def height(self) -> int: ... - @property - def width(self) -> int: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def flags(self) -> dict[str, dict[str, bool]]: ... - @property - def schema(self) -> SchemaDict: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: ... - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: ... - def _div(self, other: Any, *, floordiv: bool) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __getstate__(self) -> list[Series]: ... - def __setstate__(self, state: list[Series]) -> None: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Series]: ... - def __reversed__(self) -> Iterator[Series]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - @overload - def __getitem__(self, item: str) -> Series: ... - @overload - def __getitem__(self, item: int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector]) -> Self: ... - @overload - def __getitem__(self, item: tuple[int, int | str]) -> Any: ... 
- @overload - def __getitem__(self, item: tuple[MultiRowSelector, int | str]) -> Series: ... - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: ... - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: ... - def to_arrow(self) -> pa.Table: ... - @overload - def to_dict(self, as_series: Literal[True] = ...) -> dict[str, Series]: ... - @overload - def to_dict(self, as_series: Literal[False]) -> dict[str, list[Any]]: ... - @overload - def to_dict(self, as_series: bool) -> dict[str, Series] | dict[str, list[Any]]: ... - def to_dicts(self) -> list[dict[str, Any]]: ... - def to_numpy(self, structured: bool = ..., *, order: IndexOrder = ...) -> np.ndarray[Any, Any]: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.DataFrame: ... - def to_series(self, index: int = ...) -> Series: ... - def to_init_repr(self, n: int = ...) -> str: ... - @overload - def write_json(self, file: None = ..., *, pretty: bool = ..., row_oriented: bool = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path, *, pretty: bool = ..., row_oriented: bool = ...) -> None: ... - @overload - def write_ndjson(self, file: None = ...) -> str: ... - @overload - def write_ndjson(self, file: IOBase | str | Path) -> None: ... - @overload - def write_csv(self, file: None = ..., *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> str: ... - @overload - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ...) -> None: ... - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: ... 
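[Editorial sketch, not part of the generated stub or the diff: the `write_json`/`write_ndjson`/`write_csv` overloads just above encode that omitting `file` returns the serialized text while passing a path writes to disk and returns `None`. A minimal illustration, assuming a polars release matching these 0.19.x stubs; the file name "out.csv" is hypothetical.]

from pathlib import Path
import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})

csv_text = df.write_csv()        # file=None overload -> returns the CSV as a str
df.write_csv(Path("out.csv"))    # path overload -> writes the file, returns None
json_text = df.write_json()      # write_json / write_ndjson follow the same pattern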
- def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ..., *, position: tuple[int, int] | str = ..., table_style: str | dict[str, Any] | None = ..., table_name: str | None = ..., column_formats: ColumnFormatDict | None = ..., dtype_formats: dict[OneOrMoreDataTypes, str] | None = ..., conditional_formats: ConditionalFormatDict | None = ..., header_format: dict[str, Any] | None = ..., column_totals: ColumnTotalsDefinition | None = ..., column_widths: ColumnWidthsDefinition | None = ..., row_totals: RowTotalsDefinition | None = ..., row_heights: dict[int | tuple[int, ...], int] | int | None = ..., sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = ..., formulas: dict[str, str | dict[str, str]] | None = ..., float_precision: int = ..., has_header: bool = ..., autofilter: bool = ..., autofit: bool = ..., hidden_columns: Sequence[str] | SelectorType | None = ..., hide_gridlines: bool = ..., sheet_zoom: int | None = ..., freeze_panes: str | tuple[int, int] | tuple[str, int, int] | tuple[int, int, int, int] | None = ...) -> Workbook: ... - @overload - def write_ipc(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - @overload - def write_ipc_stream(self, file: None, compression: IpcCompression = ...) -> BytesIO: ... - @overload - def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path, compression: IpcCompression = ...) -> None: ... - def write_parquet(self, file: str | Path | BytesIO, *, compression: ParquetCompression = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., use_pyarrow: bool = ..., pyarrow_options: dict[str, Any] | None = ...) -> None: ... - def write_database(self, table_name: str, connection: str, *, if_exists: DbWriteMode = ..., engine: DbWriteEngine = ...) -> None: ... - def write_delta(self, target: str | Path | deltalake.DeltaTable, *, mode: Literal['error', 'append', 'overwrite', 'ignore'] = ..., overwrite_schema: bool = ..., storage_options: dict[str, str] | None = ..., delta_write_options: dict[str, Any] | None = ...) -> None: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def transpose(self, *, include_header: bool = ..., header_name: str = ..., column_names: str | Iterable[str] | None = ...) -> Self: ... - def reverse(self) -> DataFrame: ... - def rename(self, mapping: dict[str, str]) -> DataFrame: ... - def insert_at_idx(self, index: int, series: Series) -> Self: ... - def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: ... - @overload - def glimpse(self, *, max_items_per_column: int = ..., max_colname_length: int = ..., return_as_string: Literal[False]) -> None: ... - @overload - def glimpse(self, *, max_items_per_column: int = ..., max_colname_length: int = ..., return_as_string: Literal[True]) -> str: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: ... - def find_idx_by_name(self, name: str) -> int: ... - def replace_at_idx(self, index: int, series: Series) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ...) -> DataFrame: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... 
- def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> DataFrame: ... - def frame_equal(self, other: DataFrame, *, null_equal: bool = ...) -> bool: ... - def replace(self, column: str, new_column: Series) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: ... - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool | None = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., label: Label = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... - def upsample(self, time_column: str, *, every: str | timedelta, offset: str | timedelta | None = ..., by: str | Sequence[str] | None = ..., maintain_order: bool = ...) -> Self: ... - def join_asof(self, other: DataFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> DataFrame: ... - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ...) -> DataFrame: ... - def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - def hstack(self, columns: list[Series] | DataFrame, *, in_place: bool = ...) -> Self: ... - def vstack(self, other: DataFrame, *, in_place: bool = ...) -> Self: ... - def extend(self, other: DataFrame) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: ... - def drop_in_place(self, name: str) -> Series: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> DataFrame: ... - def clear(self, n: int = ...) -> Self: ... - def clone(self) -> Self: ... - def get_columns(self) -> list[Series]: ... - def get_column(self, name: str) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> DataFrame: ... 
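[Editorial sketch, not part of the generated stub or the diff: the `pipe` signature above uses `Concatenate[DataFrame, P]` with `*args: P.args, **kwargs: P.kwargs`, so the helper's own parameters are carried through to the `pipe` call site. A minimal, hedged illustration of that shape; `with_total` is a hypothetical helper.]

import polars as pl

def with_total(df: pl.DataFrame, *, col_a: str, col_b: str) -> pl.DataFrame:
    # add a "total" column from two named columns
    return df.with_columns((pl.col(col_a) + pl.col(col_b)).alias("total"))

df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
out = df.pipe(with_total, col_a="a", col_b="b")  # intended to be checked against with_total's signature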
- def fill_nan(self, value: Expr | int | float | None) -> DataFrame: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: ... - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ..., *, maintain_order: bool = ..., sort_columns: bool = ..., separator: str = ...) -> Self: ... - def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: ... - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[False] = ...) -> list[Self]: ... - @overload - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: str, maintain_order: bool = ..., include_key: bool = ..., as_dict: Literal[True]) -> dict[Any, Self]: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: int | str | float, *, periods: int = ...) -> DataFrame: ... - def is_duplicated(self) -> Series: ... - def is_unique(self) -> Series: ... - def lazy(self) -> LazyFrame: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: ... - @overload - def n_chunks(self, strategy: Literal['first'] = ...) -> int: ... - @overload - def n_chunks(self, strategy: Literal['all']) -> list[int]: ... - @overload - def max(self, axis: Literal[0] = ...) -> Self: ... - @overload - def max(self, axis: Literal[1]) -> Series: ... - @overload - def max(self, axis: int = ...) -> Self | Series: ... - @overload - def min(self, axis: Literal[0] = ...) -> Self: ... - @overload - def min(self, axis: Literal[1]) -> Series: ... - @overload - def min(self, axis: int = ...) -> Self | Series: ... - @overload - def sum(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def sum(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def sum(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - @overload - def mean(self, *, axis: Literal[0] = ..., null_strategy: NullStrategy = ...) -> Self: ... - @overload - def mean(self, *, axis: Literal[1], null_strategy: NullStrategy = ...) -> Series: ... - @overload - def mean(self, *, axis: int = ..., null_strategy: NullStrategy = ...) -> Self | Series: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def median(self) -> Self: ... - def product(self) -> DataFrame: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: ... 
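[Editorial sketch, not part of the generated stub or the diff: the `sum`/`mean` overloads above distinguish frame-shaped reductions (`axis=0`, returning `Self`) from row-wise reductions (`Literal[1]`, returning `Series`). An illustrative example, assuming a polars release matching these 0.19.x stubs where the `axis` keyword is still accepted.]

import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

frame_sums = df.sum()       # default axis=0 overload -> DataFrame (Self)
row_sums = df.sum(axis=1)   # Literal[1] overload -> Series

assert isinstance(frame_sums, pl.DataFrame)
assert isinstance(row_sums, pl.Series)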
- def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., *, separator: str = ..., drop_first: bool = ...) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> DataFrame: ... - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: ... - def approx_n_unique(self) -> DataFrame: ... - def approx_unique(self) -> DataFrame: ... - def rechunk(self) -> Self: ... - def null_count(self) -> Self: ... - def sample(self, n: int | Series | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[False] = ...) -> tuple[Any, ...]: ... - @overload - def row(self, index: int | None = ..., *, by_predicate: Expr | None = ..., named: Literal[True]) -> dict[str, Any]: ... - @overload - def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ... - @overload - def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *, named: bool = ..., include_key: bool = ..., unique: bool = ...) -> dict[Any, Iterable[Any]]: ... - @overload - def iter_rows(self, *, named: Literal[False] = ..., buffer_size: int = ...) -> Iterator[tuple[Any, ...]]: ... - @overload - def iter_rows(self, *, named: Literal[True], buffer_size: int = ...) -> Iterator[dict[str, Any]]: ... - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Self: ... - def take_every(self, n: int) -> DataFrame: ... - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def interpolate(self) -> DataFrame: ... - def is_empty(self) -> bool: ... - def to_struct(self, name: str) -> Series: ... - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def corr(self, **kwargs: Any) -> DataFrame: ... - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> DataFrame: ... - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> GroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> RollingGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> DynamicGroupBy: ... 
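[Editorial sketch, not part of the generated stub or the diff: the `row`/`rows`/`iter_rows` overloads above key their return type on the `named` flag, yielding tuples by default and dicts keyed by column name when `named=True`. A small illustration under the same 0.19.x polars assumption.]

import polars as pl

df = pl.DataFrame({"foo": [1, 2], "bar": ["a", "b"]})

as_tuple = df.row(0)              # named=False (default) overload -> tuple[Any, ...]
as_dict = df.row(0, named=True)   # Literal[True] overload -> dict[str, Any]

assert as_tuple == (1, "a")
assert as_dict == {"foo": 1, "bar": "a"}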
- def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ..., *, inference_size: int = ...) -> DataFrame: ... - -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/dataframe/frame.pyi new file mode 100644 index 0000000..d3aa321 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/dataframe/frame.pyi @@ -0,0 +1,6630 @@ +#: version 0.19.7 +import P +import deltalake +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as 
is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. 
+ + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
+ schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use ``pl.read_csv`` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use ``pl.read_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading ``n_rows``. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading ``n_rows``. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use ``pl.read_json`` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use ``pl.read_ndjson`` to dispatch to this method. 
+ + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with ``NaN``. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to ``True`` will raise a ``NotImplementedError``. + allow_copy + Allow memory to be copied to perform the conversion. If set to ``False``, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars dataframe to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... 
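[Editorial sketch, not part of the generated stub or the diff: the `__array__` docstring above states that `np.asarray(pl.DataFrame(...))` goes through this protocol. A minimal illustration, assuming numpy is installed alongside polars.]

import numpy as np
import polars as pl

df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
arr = np.asarray(df)   # dispatches through DataFrame.__array__
print(arr.shape)       # (2, 2)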
+ def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the dataframe as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to ``df[0,0]``, with a check that + the shape is (1,1). With row/col, this is equivalent to ``df[row,col]``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self, as_series: bool = ...) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + ``structured`` is set to ``False`` and the DataFrame dtypes allow for a + global dtype for all columns. + + Notes + ----- + If you\'re attempting to convert Utf8 to an array you\'ll need to install + ``pyarrow``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. 
+ + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ cat │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. 
Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open ``xlsxwriter.Workbook`` object that has not been closed. + If None, writes to a ``dataframe.xlsx`` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of ``{"key":value,}`` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent ``xlsxwriter`` operations. + column_formats : dict + A ``{colname(s):str,}`` or ``{selector:str,}`` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in ``dtype_formats``. + dtype_formats : dict + A ``{dtype:str,}`` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + ``column_formats`` param). It is also valid to use dtype groups such as + ``pl.FLOAT_DTYPES`` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid ``xlsxwriter`` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all ``xlsxwriter`` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. + header_format : dict + A ``{key:value,}`` dictionary of ``xlsxwriter`` format options to apply + to the table header row, such as ``{"bold":True, "font_color":"#702963"}``. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a ``{colname:funcname,}`` dict. 
+ + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A ``{colname:int,}`` or ``{selector:int,}`` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a ``{colname:columns,}`` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or ``{row_index:int,}`` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that ``row_index`` starts at zero and will be + the header row (unless ``has_headers`` is False). + sparklines : dict + A ``{colname:list,}`` or ``{colname:dict,}`` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an ``xlsxwriter``-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A ``{colname:formula,}`` or ``{colname:dict,}`` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. + float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + has_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. 
+ freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible ``xlsxwriter`` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic dataframe: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... ) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... 
) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... # create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... 
"style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path to which the IPC record batch data should be written. If set to + ``None``, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to ``pyarrow.parquet.write_table``. + + If you pass ``partition_cols`` here, the dataset will be written + using ``pyarrow.parquet.write_to_dataset``. + The ``partition_cols`` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. 
+ + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, table_name: str, connection: str) -> None: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Name of the table to create or append to in the target SQL database. + If your table name contains special characters, it should be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_exists : {\'append\', \'replace\', \'fail\'} + The insert mode. + \'replace\' will create a new database table, overwriting an existing one. + \'append\' will append to an existing table. + \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine used for writing the data. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} + How to handle existing data. + + * If \'error\', throw an error if the table already exists (default). + * If \'append\', will add new data. + * If \'overwrite\', will replace table with new data. + * If \'ignore\', will not write anything if table already exists. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. + + * See a list of supported storage options for S3 `here `__. + * See a list of supported storage options for GCS `here `__. + * See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Some other data types are not supported but have an associated `primitive type + `__ + to which they can be cast. This affects the following data types: + + - Unsigned integers + - :class:`Datetime` types with millisecond or nanosecond precision or with + time zone information + - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. 
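+
+        A minimal sketch of the casting workaround implied by the notes above;
+        the column name and table path are hypothetical:
+
+        .. code-block:: python
+
+            # Categorical is not supported by the delta protocol, so cast the
+            # column to Utf8 before writing.
+            df = pl.DataFrame({"ham": ["a", "b", "c"]}, schema={"ham": pl.Categorical})
+            df.with_columns(pl.col("ham").cast(pl.Utf8)).write_delta(
+                "/path/to/delta-table/"
+            )
+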
+ + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a dataframe as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. 
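+
+        As a sketch of the hint above: when only a long (column, value) layout
+        is needed, ``melt`` is often a cheaper alternative to a full transpose:
+
+        .. code-block:: python
+
+            df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
+            # one row per (column name, value) pair, without transposing
+            df.melt(variable_name="column", value_name="value")
+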
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_at_idx(self, index: int, series: Series) -> Self: + ''' + Insert a Series at a certain column index. This operation is in place. + + Parameters + ---------- + index + Column to insert the new `Series` column. + series + `Series` to insert. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_at_idx(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_at_idx(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, predicate: Expr | str | Series | list[bool] | np.ndarray[Any, Any] | bool) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") < 3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. 
+ All values must be in the range `[0, 1]`. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ + │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ + + ''' + def find_idx_by_name(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.find_idx_by_name("ham") + 2 + + ''' + def replace_at_idx(self, index: int, series: Series) -> Self: + ''' + Replace a column at an index location. + + Parameters + ---------- + index + Column index. + series + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_at_idx(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. 
+ + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def frame_equal(self, other: DataFrame) -> bool: + ''' + Check if DataFrame is equal to other. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.frame_equal(df1) + True + >>> df1.frame_equal(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last ``abs(n)``. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. 
+ **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. 
+ + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The ``GroupBy`` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``group_by_dynamic`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_rolling on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. 
Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.group_by_rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is ``\'window\'``. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. 
deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". 
+ + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. 
+ + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. 
+ + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... "values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. 
+ by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. 
+ + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see ``pl.StringCache()``. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: ``udf(row)``. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. 
+ + Notes + ----- + * The frame-level ``apply`` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level ``apply`` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from ``vstack`` which adds the chunks from ``other`` to the chunks of + this ``DataFrame``, ``extend`` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``vstack`` when you want to do a query after a single + append. 
For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer ``vstack`` over ``extend`` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single ``DataFrame``. In the latter case, finish the sequence of + ``vstack`` operations with a ``rechunk``. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. + + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column as Series by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill ``value``. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: Sequence[str] | str | None = ..., value_vars: Sequence[str] | str | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars), while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis, leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Columns to use as identifier variables. + value_vars + Values to use as identifier variables. + If `value_vars` is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> df.melt(id_vars="a", value_vars=["b", "c"]) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to ``None`` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... 
).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. 
+ + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying ``as_dict=True``. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift values by the given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + See Also + -------- + shift_and_fill + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift(periods=1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └──────┴──────┴──────┘ + >>> df.shift(periods=-1) + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞══════╪══════╪══════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with this value. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.shift_and_fill(periods=1, fill_value=0) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... 
) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def min(self, axis: int = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + >>> df.sum(axis=1) + shape: (3,) + Series: \'foo\' [str] + [ + "16a" + "27b" + "38c" + ] + + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 or 1. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if axis == 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + >>> df.mean(axis=1) + shape: (3,) + Series: \'foo\' [f64] + [ + 2.666667 + 3.0 + 5.5 + ] + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. 
+ + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to ``None`` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the ``DataFrame`` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> DataFrame: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`DataFrame.approx_n_unique`. + + """ + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. + + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + Int8 + Utf8 = Utf8 + Float32 + Int64 = Float32 + Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The ``index`` and ``by_predicate`` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using ``by_predicate`` it is an error condition if anything other than + one row is returned; more than one row raises ``TooManyRowsReturnedError``, and + zero rows will raise ``NoRowsReturnedError`` (both inherit from ``RowsError``). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of ``iter_rows()`` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.row(2) + (3, 8, \'c\') + + Specify ``named=True`` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use ``by_predicate`` to return the row that matches the given predicate. + + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using ``iter_rows`` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. 
If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have ``ns``-precision temporal values you should be aware that Python + natively only supports up to ``μs``-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using ``iter_slices`` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def take_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.take_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str) -> Series: + ''' + Convert a ``DataFrame`` to a ``Series`` of type ``Struct``. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy ``corrcoef`` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy ``corrcoef``. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... 
).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the non-null values in `other`. + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\'} + \'left\' will keep all rows from the left table. Rows may be duplicated if + multiple rows in right frame match left row\'s `on` key. + \'inner\' will remove rows that are not found in other + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. 
+ + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is ``\'window\'``. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/expr/expr deleted file mode 100644 index 307425a..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/expr/expr +++ /dev/null @@ -1,276 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import polars._reexport as pl - -from datetime import timedelta -from polars import DataFrame as DataFrame, LazyFrame as LazyFrame, Series as Series -from polars.datatypes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, INTEGER_DTYPES as INTEGER_DTYPES, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8, is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, numpy as np -from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.polars import PyExpr as PyExpr -from polars.type_aliases import ClosedInterval as ClosedInterval, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, MapElementsStrategy as MapElementsStrategy, NullBehavior as NullBehavior, NumericLiteral as NumericLiteral, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as 
SearchSortedSide, TemporalLiteral as TemporalLiteral, WindowMappingStrategy as WindowMappingStrategy -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Iterable, NoReturn, Sequence, TypeVar -from typing_extensions import Concatenate, Self - -T = TypeVar('T') -P = ParamSpec('P') - -class Expr(Generic[P]): - _pyexpr: PyExpr - _accessors: ClassVar[set[str]] - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _to_expr(self, other: Any) -> Expr: ... - def _repr_html_(self) -> str: ... - def __str__(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int | bool) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int | bool) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr | int | bool) -> Self: ... - def __rxor__(self, other: Any) -> Self: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: ... - @classmethod - def from_json(cls, value: str) -> Self: ... - def to_physical(self) -> Self: ... - def any(self, *, ignore_nulls: bool = ...) -> Self: ... - def all(self, *, ignore_nulls: bool = ...) -> Self: ... - def arg_true(self) -> Self: ... - def sqrt(self) -> Self: ... - def cbrt(self) -> Self: ... - def log10(self) -> Self: ... - def exp(self) -> Self: ... - def alias(self, name: str) -> Self: ... - def map_alias(self, function: Callable[[str], str]) -> Self: ... - def prefix(self, prefix: str) -> Self: ... - def suffix(self, suffix: str) -> Self: ... 
- def keep_name(self) -> Self: ... - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: ... - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def is_not(self) -> Self: ... - def not_(self) -> Self: ... - def is_null(self) -> Self: ... - def is_not_null(self) -> Self: ... - def is_finite(self) -> Self: ... - def is_infinite(self) -> Self: ... - def is_nan(self) -> Self: ... - def is_not_nan(self) -> Self: ... - def agg_groups(self) -> Self: ... - def count(self) -> Self: ... - def len(self) -> Self: ... - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: ... - def append(self, other: IntoExpr, *, upcast: bool = ...) -> Self: ... - def rechunk(self) -> Self: ... - def drop_nulls(self) -> Self: ... - def drop_nans(self) -> Self: ... - def cumsum(self, *, reverse: bool = ...) -> Self: ... - def cumprod(self, *, reverse: bool = ...) -> Self: ... - def cummin(self, *, reverse: bool = ...) -> Self: ... - def cummax(self, *, reverse: bool = ...) -> Self: ... - def cumcount(self, *, reverse: bool = ...) -> Self: ... - def floor(self) -> Self: ... - def ceil(self) -> Self: ... - def round(self, decimals: int = ...) -> Self: ... - def dot(self, other: Expr | str) -> Self: ... - def mode(self) -> Self: ... - def cast(self, dtype: PolarsDataType | type[Any], *, strict: bool = ...) -> Self: ... - def sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def top_k(self, k: int | IntoExprColumn = ...) -> Self: ... - def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Self: ... - def arg_max(self) -> Self: ... - def arg_min(self) -> Self: ... - def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: ... - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ...) -> Self: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: ... - def shift(self, periods: int = ...) -> Self: ... - def shift_and_fill(self, fill_value: IntoExpr, *, periods: int = ...) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def forward_fill(self, limit: int | None = ...) -> Self: ... - def backward_fill(self, limit: int | None = ...) -> Self: ... - def reverse(self) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def nan_max(self) -> Self: ... - def nan_min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def product(self) -> Self: ... - def n_unique(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def null_count(self) -> Self: ... - def arg_unique(self) -> Self: ... - def unique(self, *, maintain_order: bool = ...) -> Self: ... - def first(self) -> Self: ... - def last(self) -> Self: ... - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr, mapping_strategy: WindowMappingStrategy = ...) -> Self: ... - def rolling(self, index_column: str, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., check_sorted: bool = ...) 
-> Self: ... - def is_unique(self) -> Self: ... - def is_first_distinct(self) -> Self: ... - def is_last_distinct(self) -> Self: ... - def is_duplicated(self) -> Self: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., left_closed: bool = ..., include_breaks: bool = ...) -> Self: ... - def qcut(self, quantiles: Sequence[float] | int, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ...) -> Self: ... - def rle(self) -> Self: ... - def rle_id(self) -> Self: ... - def filter(self, predicate: Expr) -> Self: ... - def where(self, predicate: Expr) -> Self: ... - def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def flatten(self) -> Self: ... - def explode(self) -> Self: ... - def implode(self) -> Self: ... - def take_every(self, n: int) -> Self: ... - def head(self, n: int | Expr = ...) -> Self: ... - def tail(self, n: int | Expr = ...) -> Self: ... - def limit(self, n: int | Expr = ...) -> Self: ... - def and_(self, *others: Any) -> Self: ... - def or_(self, *others: Any) -> Self: ... - def eq(self, other: Any) -> Self: ... - def eq_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self: ... - def gt(self, other: Any) -> Self: ... - def le(self, other: Any) -> Self: ... - def lt(self, other: Any) -> Self: ... - def ne(self, other: Any) -> Self: ... - def ne_missing(self, other: Any) -> Self: ... - def add(self, other: Any) -> Self: ... - def floordiv(self, other: Any) -> Self: ... - def mod(self, other: Any) -> Self: ... - def mul(self, other: Any) -> Self: ... - def sub(self, other: Any) -> Self: ... - def truediv(self, other: Any) -> Self: ... - def pow(self, exponent: int | float | None | Series | Expr) -> Self: ... - def xor(self, other: Any) -> Self: ... - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: ... - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: ... - def reinterpret(self, *, signed: bool = ...) -> Self: ... - def inspect(self, fmt: str = ...) -> Self: ... - def interpolate(self, method: InterpolationMethod = ...) -> Self: ... - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... 
- def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ..., ddof: int = ...) -> Self: ... - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., by: str | None = ..., closed: ClosedInterval = ...) -> Self: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Self: ... - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def abs(self) -> Self: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Self: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: ... - def pct_change(self, n: int = ...) -> Self: ... - def skew(self, *, bias: bool = ...) -> Self: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> Self: ... - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: ... - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: ... - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def sign(self) -> Self: ... - def sin(self) -> Self: ... - def cos(self) -> Self: ... - def tan(self) -> Self: ... - def arcsin(self) -> Self: ... - def arccos(self) -> Self: ... - def arctan(self) -> Self: ... - def sinh(self) -> Self: ... - def cosh(self) -> Self: ... - def tanh(self) -> Self: ... - def arcsinh(self) -> Self: ... - def arccosh(self) -> Self: ... - def arctanh(self) -> Self: ... - def degrees(self) -> Self: ... - def radians(self) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Self: ... - def shuffle(self, seed: int | None = ...) -> Self: ... - def sample(self, n: int | Expr | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Self: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... 
- def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Self: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> Self: ... - def unique_counts(self) -> Self: ... - def log(self, base: float = ...) -> Self: ... - def log1p(self) -> Self: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> Self: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Self: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def shrink_dtype(self) -> Self: ... - def cache(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ..., *, agg_list: bool = ...) -> Self: ... - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ..., pass_name: bool = ..., strategy: MapElementsStrategy = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Self: ... - def is_first(self) -> Self: ... - def is_last(self) -> Self: ... - def _register_plugin(self, lib: str, symbol: str, args: list[IntoExpr] | None = ..., *, is_elementwise: bool = ..., input_wildcard_expansion: bool = ..., auto_explode: bool = ..., cast_to_supertypes: bool = ...) -> Self: ... - @property - def bin(self) -> ExprBinaryNameSpace: ... - @property - def cat(self) -> ExprCatNameSpace: ... - @property - def dt(self) -> ExprDateTimeNameSpace: ... - @property - def list(self) -> ExprListNameSpace: ... - @property - def arr(self) -> ExprArrayNameSpace: ... - @property - def meta(self) -> ExprMetaNameSpace: ... - @property - def str(self) -> ExprStringNameSpace: ... - @property - def struct(self) -> ExprStructNameSpace: ... - -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: ... -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/expr/expr.pyi new file mode 100644 index 0000000..c6960c5 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/expr/expr.pyi @@ -0,0 +1,8108 @@ +#: version 0.19.7 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Categorical as Categorical, FLOAT_DTYPES as FLOAT_DTYPES, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, warn_closed_future_change as warn_closed_future_change +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _to_expr(self, other: Any) -> Expr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... 
+ def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are ``True``. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. 
+ + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to ``False``, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no ``True`` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting ``ignore_nulls=False``. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map_alias + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + alias + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().map_alias(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. 
+ + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").keep_name()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).keep_name()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with ``^`` and end with ``$``. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... 
) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_not_null().suffix("_not_null")) # nan != null + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point ```NaN`` (Not A Number) should not be confused + with missing data represented as ``Null/None``. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Count the number of values in this expression. + + .. warning:: + `null` is deemed a value in this context. + + Examples + -------- + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def len(self) -> Self: + ''' + Count the number of values in this expression. + + Alias for :func:`count`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().len()) # counts nulls + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. 
+ upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cumsum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumsum(), + ... pl.col("a").cumsum(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 10 │ + │ 3 ┆ 9 │ + │ 6 ┆ 7 │ + │ 10 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cumsum().alias("value_cumsum"), + ... pl.col("values") + ... .cumsum() + ... .forward_fill() + ... .alias("value_cumsum_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cumsum ┆ value_cumsum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumprod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. 
+ + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumprod(), + ... pl.col("a").cumprod(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 24 │ + │ 2 ┆ 24 │ + │ 6 ┆ 12 │ + │ 24 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummin(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummin(), + ... pl.col("a").cummin(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴───────────┘ + + ''' + def cummax(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cummax(), + ... pl.col("a").cummax(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 3 ┆ 4 │ + │ 4 ┆ 4 │ + └─────┴───────────┘ + + Null values are excluded, but can also be filled by calling ``forward_fill``. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... [ + ... pl.col("values").cummax().alias("value_cummax"), + ... pl.col("values") + ... .cummax() + ... .forward_fill() + ... .alias("value_cummax_all_filled"), + ... ] + ... ) + shape: (8, 3) + ┌────────┬──────────────┬─────────────────────────┐ + │ values ┆ value_cummax ┆ value_cummax_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════════╪═════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴──────────────┴─────────────────────────┘ + + ''' + def cumcount(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. + + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.select( + ... [ + ... pl.col("a").cumcount(), + ... pl.col("a").cumcount(reverse=True).alias("a_reverse"), + ... ] + ... ) + shape: (4, 2) + ┌─────┬───────────┐ + │ a ┆ a_reverse │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═══════════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 2 │ + │ 2 ┆ 1 │ + │ 3 ┆ 0 │ + └─────┴───────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... 
) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... 
pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... 
) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").take(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, periods: int = ...) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.with_columns(foo_shifted=pl.col("foo").shift(1)) + shape: (4, 2) + ┌─────┬─────────────┐ + │ foo ┆ foo_shifted │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════════════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴─────────────┘ + + ''' + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4]}) + >>> df.with_columns(foo_shifted=pl.col("foo").shift_and_fill("a", periods=1)) + shape: (4, 2) + ┌─────┬─────────────┐ + │ foo ┆ foo_shifted │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════════════╡ + │ 1 ┆ a │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴─────────────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... 
) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... 
) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns(pl.col("c").max().over("a").suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. 
+ + >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min")) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... 
"2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... ) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. 
+ allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. 
If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").filter(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").filter(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for ``map`` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Notes + ----- + If you are looking to map a function over a window function or group_by context, + refer to func:`map_elements` instead. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_dict + map_elements + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type ``Callable[[Any], Any]``. + Applies a Python function to each individual value in the column. 
+ * GroupBy + Expects `function` to be of type ``Callable[[Series], Any]``. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be ``pl.Unknown``. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). + pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using ``map_elements`` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using ``over`` is considered a GroupBy context + here, so ``map_elements`` can be used to map functions over window groups. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... 
) # doctest: +IGNORE_RESULT + + Window function application using ``over`` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... ).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").take_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator ``expr & other & ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator ``expr | other | ...``. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr == other`` where `None` == None`. + + This differs from default ``eq`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator ``expr >= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator ``expr > other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator ``expr <= other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator ``expr < other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator ``expr != other``. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... 
"y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator ``expr != other`` where `None` == None`. + + This differs from default ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator ``expr + other``. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cumprod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator ``expr // other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator ``expr % other``. + + Parameters + ---------- + other + Numeric literal or expression value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator ``expr * other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator ``expr - other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cumsum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator ``expr / other``. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator ``expr ** exponent``. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator ``expr ^ other``. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.select([pl.col("optional_members").is_in("sets").alias("contains")]) + shape: (3, 1) + ┌──────────┐ + │ contains │ + │ --- │ + │ bool │ + ╞══════════╡ + │ true │ + │ true │ + │ false │ + └──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. 
Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with ``lit`` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) 
-> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cumsum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 2 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. 
+ + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... 
window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... 
rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... 
rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + If ``by`` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a ``by`` column ````, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. 
Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `group_by_rolling` this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. 
+ + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the n-th discrete difference. 
+ + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. 
+ + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip(1, 10).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 10 │ + └──────┴─────────────┘ + + ''' + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_min(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ 0 │ + │ 5 ┆ 5 │ + │ null ┆ null │ + │ 50 ┆ 50 │ + └──────┴─────────────┘ + + ''' + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [-50, 5, None, 50]}) + >>> df.with_columns(pl.col("foo").clip_max(0).alias("foo_clipped")) + shape: (4, 2) + ┌──────┬─────────────┐ + │ foo ┆ foo_clipped │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════════════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 0 │ + │ null ┆ null │ + │ 50 ┆ 0 │ + └──────┴─────────────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | Expr | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. 
math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. 
+ + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula ``-sum(pk * log(pk)`` where ``pk`` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. 
+ + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def cache(self) -> Self: + """ + Cache this expression so that it only is executed once per context. + + .. deprecated:: 0.18.9 + This method now does nothing. It has been superseded by the + `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically + caches expressions that are equal. + + """ + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in column according to remapping dictionary. + + Needs a global string cache for lazily evaluated queries on columns of + type ``pl.Categorical``. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + See Also + -------- + map + + Examples + -------- + >>> country_code_dict = { + ... "CA": "Canada", + ... "DE": "Germany", + ... "FR": "France", + ... None: "Not specified", + ... } + >>> df = pl.DataFrame( + ... { + ... "country_code": ["FR", None, "ES", "DE"], + ... } + ... ).with_row_count() + >>> df + shape: (4, 2) + ┌────────┬──────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪══════════════╡ + │ 0 ┆ FR │ + │ 1 ┆ null │ + │ 2 ┆ ES │ + │ 3 ┆ DE │ + └────────┴──────────────┘ + + >>> df.with_columns( + ... pl.col("country_code").map_dict(country_code_dict).alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ null │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + Set a default value for values that cannot be mapped... + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default="unknown") + ... 
.alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ unknown │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by making use of ``pl.first()``: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.first()) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + ...or keep the original value, by explicitly referring to the column: + + >>> df.with_columns( + ... pl.col("country_code") + ... .map_dict(country_code_dict, default=pl.col("country_code")) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬───────────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ str │ + ╞════════╪══════════════╪═══════════════╡ + │ 0 ┆ FR ┆ France │ + │ 1 ┆ null ┆ Not specified │ + │ 2 ┆ ES ┆ ES │ + │ 3 ┆ DE ┆ Germany │ + └────────┴──────────────┴───────────────┘ + + If you need to access different columns to set a default value, a struct needs + to be constructed; in the first field is the column that you want to remap and + the rest of the fields are the other columns used in the default expression. + + >>> df.with_columns( + ... pl.struct(pl.col(["country_code", "row_nr"])).map_dict( + ... remapping=country_code_dict, + ... default=pl.col("row_nr").cast(pl.Utf8), + ... ) + ... ) + shape: (4, 2) + ┌────────┬───────────────┐ + │ row_nr ┆ country_code │ + │ --- ┆ --- │ + │ u32 ┆ str │ + ╞════════╪═══════════════╡ + │ 0 ┆ France │ + │ 1 ┆ Not specified │ + │ 2 ┆ 2 │ + │ 3 ┆ Germany │ + └────────┴───────────────┘ + + Override return dtype: + + >>> df.with_columns( + ... pl.col("row_nr") + ... .map_dict({1: 7, 3: 4}, default=3, return_dtype=pl.UInt8) + ... .alias("remapped") + ... ) + shape: (4, 3) + ┌────────┬──────────────┬──────────┐ + │ row_nr ┆ country_code ┆ remapped │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ str ┆ u8 │ + ╞════════╪══════════════╪══════════╡ + │ 0 ┆ FR ┆ 3 │ + │ 1 ┆ null ┆ 7 │ + │ 2 ┆ ES ┆ 3 │ + │ 3 ┆ DE ┆ 4 │ + └────────┴──────────────┴──────────┘ + + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + ``polars.Unknown``. + skip_nulls + Don't apply the function over values + that contain nulls. 
This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def _register_plugin(self, lib: str, symbol: str, args: list[IntoExpr] | None = ...) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by ``lib::symbol`` + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. + auto_explode + Explode the results in a group_by. + This is recommended for aggregation functions. + cast_to_supertypes + Cast the input datatypes to their supertype. + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/lazyframe/frame deleted file mode 100644 index 10ae489..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/lazyframe/frame +++ /dev/null @@ -1,159 +0,0 @@ -from typing_extensions import ParamSpec, Generic -import pyarrow as pa - -from datetime import timedelta -from io import IOBase -from pathlib import Path -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Boolean as Boolean, Categorical as Categorical, DTYPE_TEMPORAL_UNITS as DTYPE_TEMPORAL_UNITS, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, N_INFER_DEFAULT as N_INFER_DEFAULT, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat, subprocess as subprocess -from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy -from polars.polars import PyLazyFrame as PyLazyFrame -from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.type_aliases import AsofJoinStrategy as AsofJoinStrategy, ClosedInterval as ClosedInterval, ColumnNameOrSelector as ColumnNameOrSelector, CsvEncoding as CsvEncoding, CsvQuoteStyle as CsvQuoteStyle, FillNullStrategy as FillNullStrategy, FrameInitTypes as FrameInitTypes, IntoExpr as IntoExpr, JoinStrategy as JoinStrategy, JoinValidation as JoinValidation, Label as Label, Orientation as Orientation, ParallelStrategy as ParallelStrategy, PolarsDataType as PolarsDataType, RollingInterpolationMethod as RollingInterpolationMethod, SchemaDefinition as SchemaDefinition, SchemaDict as SchemaDict, StartBy as StartBy, UniqueKeepStrategy as UniqueKeepStrategy -from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalize_filepath as normalize_filepath -from typing import Any, Awaitable, Callable, ClassVar, Collection, Concatenate, Iterable, Literal, Mapping, NoReturn, Sequence, TypeVar, overload -from typing_extensions import Self - -T = TypeVar('T') -P = ParamSpec('P') - -class LazyFrame(Generic[P]): - _ldf: PyLazyFrame - _accessors: ClassVar[set[str]] 
- def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ..., *, schema_overrides: SchemaDict | None = ..., orient: Orientation | None = ..., infer_schema_length: int | None = ..., nan_to_null: bool = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - @classmethod - def _scan_csv(cls, source: str, *, has_header: bool = ..., separator: str = ..., comment_char: str | None = ..., quote_char: str | None = ..., skip_rows: int = ..., dtypes: SchemaDict | None = ..., schema: SchemaDict | None = ..., null_values: str | Sequence[str] | dict[str, str] | None = ..., missing_utf8_is_empty_string: bool = ..., ignore_errors: bool = ..., cache: bool = ..., with_column_names: Callable[[list[str]], list[str]] | None = ..., infer_schema_length: int | None = ..., n_rows: int | None = ..., encoding: CsvEncoding = ..., low_memory: bool = ..., rechunk: bool = ..., skip_rows_after_header: int = ..., row_count_name: str | None = ..., row_count_offset: int = ..., try_parse_dates: bool = ..., eol_char: str = ..., raise_if_empty: bool = ..., truncate_ragged_lines: bool = ...) -> Self: ... - @classmethod - def _scan_parquet(cls, source: str, *, n_rows: int | None = ..., cache: bool = ..., parallel: ParallelStrategy = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., low_memory: bool = ..., use_statistics: bool = ..., hive_partitioning: bool = ..., retries: int = ...) -> Self: ... - @classmethod - def _scan_ipc(cls, source: str | Path, *, n_rows: int | None = ..., cache: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ..., storage_options: dict[str, object] | None = ..., memory_map: bool = ...) -> Self: ... - @classmethod - def _scan_ndjson(cls, source: str, *, infer_schema_length: int | None = ..., batch_size: int | None = ..., n_rows: int | None = ..., low_memory: bool = ..., rechunk: bool = ..., row_count_name: str | None = ..., row_count_offset: int = ...) -> Self: ... - @classmethod - def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any, *, pyarrow: bool = ...) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: ... - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: ... - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: ... - @property - def columns(self) -> list[str]: ... - @property - def dtypes(self) -> list[PolarsDataType]: ... - @property - def schema(self) -> SchemaDict: ... - def __dataframe_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - @property - def width(self) -> int: ... - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... - def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def _repr_html_(self) -> str: ... 
- @overload - def serialize(self, file: None = ...) -> str: ... - @overload - def serialize(self, file: IOBase | str | Path) -> None: ... - @overload - def write_json(self, file: None = ...) -> str: ... - @overload - def write_json(self, file: IOBase | str | Path) -> None: ... - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: ... - def explain(self, *, optimized: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str: ... - def show_graph(self, *, optimized: bool = ..., show: bool = ..., output_path: str | Path | None = ..., raw_output: bool = ..., figsize: tuple[float, float] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> str | None: ... - def inspect(self, fmt: str = ...) -> Self: ... - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def top_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def bottom_k(self, k: int, *, by: IntoExpr | Iterable[IntoExpr], descending: bool | Sequence[bool] = ..., nulls_last: bool = ..., maintain_order: bool = ...) -> Self: ... - def profile(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., show_plot: bool = ..., truncate_nodes: int = ..., figsize: tuple[int, int] = ..., streaming: bool = ...) -> tuple[DataFrame, DataFrame]: ... - def collect(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ..., **kwargs: Any) -> DataFrame: ... - @overload - def collect_async(self, *, gevent: Literal[True], type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> _GeventDataFrameResult[DataFrame]: ... - @overload - def collect_async(self, *, gevent: Literal[False] = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> Awaitable[DataFrame]: ... 
- def sink_parquet(self, path: str | Path, *, compression: str = ..., compression_level: int | None = ..., statistics: bool = ..., row_group_size: int | None = ..., data_pagesize_limit: int | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_ipc(self, path: str | Path, *, compression: str | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def sink_csv(self, path: str | Path, *, has_header: bool = ..., separator: str = ..., line_terminator: str = ..., quote: str = ..., batch_size: int = ..., datetime_format: str | None = ..., date_format: str | None = ..., time_format: str | None = ..., float_precision: int | None = ..., null_value: str | None = ..., quote_style: CsvQuoteStyle | None = ..., maintain_order: bool = ..., type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> DataFrame: ... - def _set_sink_optimizations(self, *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ...) -> PyLazyFrame: ... - def fetch(self, n_rows: int = ..., *, type_coercion: bool = ..., predicate_pushdown: bool = ..., projection_pushdown: bool = ..., simplify_expression: bool = ..., no_optimization: bool = ..., slice_pushdown: bool = ..., comm_subplan_elim: bool = ..., comm_subexpr_elim: bool = ..., streaming: bool = ...) -> DataFrame: ... - def lazy(self) -> Self: ... - def cache(self) -> Self: ... - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, *, strict: bool = ...) -> Self: ... - def clear(self, n: int = ...) -> LazyFrame: ... - def clone(self) -> Self: ... - def filter(self, predicate: IntoExpr) -> Self: ... - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def group_by_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def group_by_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool | None = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., label: Label = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... 
- def join_asof(self, other: LazyFrame, *, left_on: str | None | Expr = ..., right_on: str | None | Expr = ..., on: str | None | Expr = ..., by_left: str | Sequence[str] | None = ..., by_right: str | Sequence[str] | None = ..., by: str | Sequence[str] | None = ..., strategy: AsofJoinStrategy = ..., suffix: str = ..., tolerance: str | int | float | timedelta | None = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ..., *, left_on: str | Expr | Sequence[str | Expr] | None = ..., right_on: str | Expr | Sequence[str | Expr] | None = ..., suffix: str = ..., validate: JoinValidation = ..., allow_parallel: bool = ..., force_parallel: bool = ...) -> Self: ... - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: ... - def with_context(self, other: Self | list[Self]) -> Self: ... - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: ... - def rename(self, mapping: dict[str, str]) -> Self: ... - def reverse(self) -> Self: ... - def shift(self, periods: int) -> Self: ... - def shift_and_fill(self, fill_value: Expr | int | str | float, *, periods: int = ...) -> Self: ... - def slice(self, offset: int, length: int | None = ...) -> Self: ... - def limit(self, n: int = ...) -> Self: ... - def head(self, n: int = ...) -> Self: ... - def tail(self, n: int = ...) -> Self: ... - def last(self) -> Self: ... - def first(self) -> Self: ... - def approx_n_unique(self) -> Self: ... - def approx_unique(self) -> Self: ... - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: ... - def take_every(self, n: int) -> Self: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ..., *, matches_supertype: bool = ...) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Self: ... - def std(self, ddof: int = ...) -> Self: ... - def var(self, ddof: int = ...) -> Self: ... - def max(self) -> Self: ... - def min(self) -> Self: ... - def sum(self) -> Self: ... - def mean(self) -> Self: ... - def median(self) -> Self: ... - def null_count(self) -> Self: ... - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: ... - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: ... - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ..., *, keep: UniqueKeepStrategy = ..., maintain_order: bool = ...) -> Self: ... - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: ... - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ..., *, streamable: bool = ...) -> Self: ... - def map_batches(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... - def interpolate(self) -> Self: ... 
- def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: ... - def merge_sorted(self, other: LazyFrame, key: str) -> Self: ... - def set_sorted(self, column: str | Iterable[str], *more_columns: str, descending: bool = ...) -> Self: ... - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: ... - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr, maintain_order: bool = ...) -> LazyGroupBy: ... - def groupby_rolling(self, index_column: IntoExpr, *, period: str | timedelta, offset: str | timedelta | None = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def groupby_dynamic(self, index_column: IntoExpr, *, every: str | timedelta, period: str | timedelta | None = ..., offset: str | timedelta | None = ..., truncate: bool = ..., include_boundaries: bool = ..., closed: ClosedInterval = ..., by: IntoExpr | Iterable[IntoExpr] | None = ..., start_by: StartBy = ..., check_sorted: bool = ...) -> LazyGroupBy: ... - def map(self, function: Callable[[DataFrame], DataFrame], *, predicate_pushdown: bool = ..., projection_pushdown: bool = ..., slice_pushdown: bool = ..., no_optimizations: bool = ..., schema: None | SchemaDict = ..., validate_output_schema: bool = ..., streamable: bool = ...) -> Self: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..3f922c8 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/lazyframe/frame.pyi @@ -0,0 +1,4012 @@ +#: version 0.19.7 +import P +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, 
deprecate_renamed_parameter as deprecate_renamed_parameter +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, find_stacklevel as find_stacklevel, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use ``pl.scan_csv`` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use ``pl.scan_parquet`` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use ``pl.scan_ipc`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use ``pl.scan_ndjson`` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | dict[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def from_json(cls, json: str) -> Self: + """ + Read a logical plan from a JSON string to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This method is deprecated. Convert the JSON string to ``StringIO`` + and then use ``LazyFrame.deserialize``. + + Parameters + ---------- + json + String in JSON format. + + See Also + -------- + deserialize + + """ + @classmethod + def read_json(cls, source: str | Path | IOBase) -> Self: + """ + Read a logical plan from a JSON file to construct a LazyFrame. + + .. deprecated:: 0.18.12 + This class method has been renamed to ``deserialize``. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + deserialize + + """ + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a ``read()`` method, such as a file handler (e.g. + via builtin ``open`` function) or ``BytesIO``). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. 
+ Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + """ + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.serialize`. + + Parameters + ---------- + file + File path to which the result should be written. If set to ``None`` + (default), the output is returned as a string instead. + """ + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... 
) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to ``True``. + If this is set to ``True`` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.select(pl.col("foo").cumsum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. 
+ *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. 
Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self, **kwargs: Any) -> DataFrame: + ''' + Collect into a DataFrame. + + Note: use :func:`fetch` if you want to run your query on the first `n` rows + only. This can be a huge time saver in debugging queries. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. 
+ simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + **kwargs + For internal use. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + dataframe directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. 
+ + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + has_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). 
+ date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both ``Float32`` and + ``Float64`` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that ``fetch`` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if ``n_rows`` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... 
"c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. 
+ + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, predicate: IntoExpr) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicate + Expression that evaluates to a boolean Series. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") < 3).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... 
is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call ``agg`` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set ``maintain_order=True`` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a ``dynamic_group_by`` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ````, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... 
+ * (t_n - period, t_n] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_rolling on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.group_by_rolling(index_column="dt", period="2d") + ... .agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + ... .collect() + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. 
+ * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date (e.g. 2022-02-29 -> 2022-02-28) + instead of erroring. + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... 
"time", every="1h", include_boundaries=True, closed="right" + ... ).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + Suffix with `"_saturating"` to indicate that dates too large for + their month should saturate at the largest date + (e.g. 2022-02-29 -> 2022-02-28) instead of erroring. + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} + Join strategy. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. 
+ allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 4) + ┌──────┬──────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞══════╪══════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ null ┆ null ┆ d ┆ z │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └──────┴──────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. 
+ + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the experimental setting ``Config.set_auto_structify(True)``: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. 
+ + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another dataframe: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context(train_lf.select(pl.all().suffix("_train"))).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the dataframe. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If names are swapped. E.g. \'A\' points to \'B\' and \'B\' points to \'A\', polars + will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... 
) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, periods: int) -> Self: + ''' + Shift the values by a given period. + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift(periods=1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └──────┴──────┘ + >>> lf.shift(periods=-1).collect() + shape: (3, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + ''' + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.shift_and_fill(fill_value=0, periods=1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 2 │ + │ 3 ┆ 4 │ + └─────┴─────┘ + >>> lf.shift_and_fill(periods=-1, fill_value=0).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 4 │ + │ 5 ┆ 6 │ + │ 0 ┆ 0 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. 
The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_unique(self) -> Self: + """ + Approximate count of unique values. + + .. deprecated:: 0.18.12 + This method has been renamed to :func:`LazyFrame.approx_n_unique`. + + """ + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def take_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... 
"b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.take_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill ``value`` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("NaN"), 4], + ... "b": [0.5, 4, float("NaN"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. 
+ By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to ``None`` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to ``None`` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... 
) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. 
+ + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The ``schema`` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, ``predicate_pushdown`` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌─────┬──────┬─────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 1 ┆ 6 ┆ 1 │ + │ 5 ┆ 7 ┆ 3 │ + │ 9 ┆ 9 ┆ 6 │ + │ 10 ┆ null ┆ 9 │ + └─────┴──────┴─────┘ + + ''' + def unnest(self, columns: str | Sequence[str], *more_columns: str) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + how : {\'left\', \'inner\'} + \'left\' will keep all rows from the left table. Rows may be duplicated if + multiple rows in right frame match left row\'s `on` key. 
+ \'inner\' will remove rows that are not found in other + + Notes + ----- + This is syntactic sugar for a left/inner join + coalesce + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [4, None, 6], + ... "C": [7, 8, 9], + ... } + ... ) + >>> new_df + shape: (3, 2) + ┌──────┬─────┐ + │ B ┆ C │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 4 ┆ 7 │ + │ null ┆ 8 │ + │ 6 ┆ 9 │ + └──────┴─────┘ + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 500 │ + │ 3 ┆ 6 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to ``True`` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. 
+ This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is ``\'window\'``. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains ``\'w\'``): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the ``by`` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to ``False``. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call ``.agg`` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to ``None`` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to ``False`` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + @property + def columns(self): ... 
+ @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/series/series deleted file mode 100644 index 5c1f0fe..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/series/series +++ /dev/null @@ -1,366 +0,0 @@ - -from datetime import date, datetime, timedelta -from polars import DataFrame as DataFrame, Expr as Expr -from polars.datatypes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, INTEGER_DTYPES as INTEGER_DTYPES, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8, dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _PYARROW_AVAILABLE as _PYARROW_AVAILABLE, _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, numpy as np, pandas as pd, pyarrow as pa -from polars.exceptions import ShapeError as ShapeError -from polars.polars import PyDataFrame as PyDataFrame, PySeries as PySeries -from polars.series._numpy import SeriesView as SeriesView -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.type_aliases import ClosedInterval as ClosedInterval, ComparisonOperator as ComparisonOperator, FillNullStrategy as FillNullStrategy, InterpolationMethod as InterpolationMethod, IntoExpr as IntoExpr, IntoExprColumn as IntoExprColumn, NullBehavior as NullBehavior, NumericLiteral as NumericLiteral, OneOrMoreDataTypes as OneOrMoreDataTypes, PolarsDataType as PolarsDataType, PythonLiteral as PythonLiteral, RankMethod as RankMethod, RollingInterpolationMethod as RollingInterpolationMethod, SearchSortedSide as SearchSortedSide, SizeUnit as SizeUnit, TemporalLiteral as TemporalLiteral -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, 
_datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time -from polars.utils.deprecation import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar, Collection, Generator, Literal, NoReturn, Sequence, overload -from typing_extensions import Self - -ArrayLike = Union[Sequence[Any], "Series", "pa.Array", "pa.ChunkedArray", "np.ndarray", "pd.Series", "pd.DatetimeIndex"] - -class Series: - _s: PySeries - _accessors: ClassVar[set[str]] - def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ..., *, strict: bool = ..., nan_to_null: bool = ..., dtype_if_empty: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array, *, rechunk: bool = ...) -> Self: ... - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex, *, nan_to_null: bool = ...) -> Self: ... - def _get_ptr(self) -> tuple[int, int, int]: ... - @property - def dtype(self) -> PolarsDataType: ... - @property - def flags(self) -> dict[str, bool]: ... - @property - def inner_dtype(self) -> PolarsDataType | None: ... - @property - def name(self) -> str: ... - @property - def shape(self) -> tuple[int]: ... - def __bool__(self) -> NoReturn: ... - def __getstate__(self) -> bytes: ... - def __setstate__(self, state: bytes) -> None: ... - def __str__(self) -> str: ... - def __repr__(self) -> str: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Series: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... - @overload - def __eq__(self, other: Expr) -> Expr: ... - @overload - def __eq__(self, other: Any) -> Series: ... - @overload - def __ne__(self, other: Expr) -> Expr: ... - @overload - def __ne__(self, other: Any) -> Series: ... - @overload - def __gt__(self, other: Expr) -> Expr: ... - @overload - def __gt__(self, other: Any) -> Series: ... - @overload - def __lt__(self, other: Expr) -> Expr: ... - @overload - def __lt__(self, other: Any) -> Series: ... - @overload - def __ge__(self, other: Expr) -> Expr: ... - @overload - def __ge__(self, other: Any) -> Series: ... - @overload - def __le__(self, other: Expr) -> Expr: ... - @overload - def __le__(self, other: Any) -> Series: ... - def le(self, other: Any) -> Self | Expr: ... - def lt(self, other: Any) -> Self | Expr: ... - def eq(self, other: Any) -> Self | Expr: ... - @overload - def eq_missing(self, other: Any) -> Self: ... - @overload - def eq_missing(self, other: Expr) -> Expr: ... - def ne(self, other: Any) -> Self | Expr: ... - @overload - def ne_missing(self, other: Expr) -> Expr: ... 
- @overload - def ne_missing(self, other: Any) -> Self: ... - def ge(self, other: Any) -> Self | Expr: ... - def gt(self, other: Any) -> Self | Expr: ... - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... - @overload - def __add__(self, other: DataFrame) -> DataFrame: ... - @overload - def __add__(self, other: Expr) -> Expr: ... - @overload - def __add__(self, other: Any) -> Self: ... - @overload - def __sub__(self, other: Expr) -> Expr: ... - @overload - def __sub__(self, other: Any) -> Self: ... - @overload - def __truediv__(self, other: Expr) -> Expr: ... - @overload - def __truediv__(self, other: Any) -> Series: ... - @overload - def __floordiv__(self, other: Expr) -> Expr: ... - @overload - def __floordiv__(self, other: Any) -> Series: ... - def __invert__(self) -> Series: ... - @overload - def __mul__(self, other: Expr) -> Expr: ... - @overload - def __mul__(self, other: DataFrame) -> DataFrame: ... - @overload - def __mul__(self, other: Any) -> Series: ... - @overload - def __mod__(self, other: Expr) -> Expr: ... - @overload - def __mod__(self, other: Any) -> Series: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - @overload - def __getitem__(self, item: int) -> Any: ... - @overload - def __getitem__(self, item: Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Series: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: ... - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: ... - def __column_consortium_standard__(self, *, api_version: str | None = ...) -> Any: ... - def _repr_html_(self) -> str: ... - def item(self, index: int | None = ...) -> Any: ... - def estimated_size(self, unit: SizeUnit = ...) -> int | float: ... - def sqrt(self) -> Series: ... - def cbrt(self) -> Series: ... - @overload - def any(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def any(self, *, ignore_nulls: bool) -> bool | None: ... - @overload - def all(self, *, ignore_nulls: Literal[True] = ...) -> bool: ... - @overload - def all(self, *, ignore_nulls: bool) -> bool | None: ... - def log(self, base: float = ...) -> Series: ... - def log1p(self) -> Series: ... - def log10(self) -> Series: ... - def exp(self) -> Series: ... - def drop_nulls(self) -> Series: ... - def drop_nans(self) -> Series: ... - def to_frame(self, name: str | None = ...) -> DataFrame: ... - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: ... 
- def sum(self) -> int | float: ... - def mean(self) -> int | float | None: ... - def product(self) -> int | float: ... - def pow(self, exponent: int | float | None | Series) -> Series: ... - def min(self) -> PythonLiteral | None: ... - def max(self) -> PythonLiteral | None: ... - def nan_max(self) -> int | float | date | datetime | timedelta | str: ... - def nan_min(self) -> int | float | date | datetime | timedelta | str: ... - def std(self, ddof: int = ...) -> float | None: ... - def var(self, ddof: int = ...) -> float | None: ... - def median(self) -> float | None: ... - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: ... - def to_dummies(self, separator: str = ...) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def cut(self, breaks: Sequence[float], labels: Sequence[str] | None = ..., break_point_label: str = ..., category_label: str = ..., *, left_closed: bool = ..., include_breaks: bool = ..., as_series: bool) -> Series | DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[True] = ...) -> Series: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: Literal[False]) -> DataFrame: ... - @overload - def qcut(self, quantiles: Sequence[float] | int, *, labels: Sequence[str] | None = ..., left_closed: bool = ..., allow_duplicates: bool = ..., include_breaks: bool = ..., break_point_label: str = ..., category_label: str = ..., as_series: bool) -> Series | DataFrame: ... - def rle(self) -> Series: ... - def rle_id(self) -> Series: ... - def hist(self, bins: list[float] | None = ..., *, bin_count: int | None = ...) -> DataFrame: ... - def value_counts(self, *, sort: bool = ..., parallel: bool = ...) -> DataFrame: ... - def unique_counts(self) -> Series: ... - def entropy(self, base: float = ..., *, normalize: bool = ...) -> float | None: ... - def cumulative_eval(self, expr: Expr, min_periods: int = ..., *, parallel: bool = ...) -> Series: ... - def alias(self, name: str) -> Series: ... - def rename(self, name: str) -> Series: ... - def chunk_lengths(self) -> list[int]: ... - def n_chunks(self) -> int: ... - def cummax(self, *, reverse: bool = ...) -> Series: ... - def cummin(self, *, reverse: bool = ...) -> Series: ... - def cumprod(self, *, reverse: bool = ...) -> Series: ... - def cumsum(self, *, reverse: bool = ...) -> Series: ... - def slice(self, offset: int, length: int | None = ...) -> Series: ... - def append(self, other: Series, *, append_chunks: bool | None = ...) -> Self: ... - def extend(self, other: Series) -> Self: ... - def filter(self, predicate: Series | list[bool]) -> Self: ... - def head(self, n: int = ...) -> Series: ... 
- def tail(self, n: int = ...) -> Series: ... - def limit(self, n: int = ...) -> Series: ... - def take_every(self, n: int) -> Series: ... - def sort(self, *, descending: bool = ..., in_place: bool = ...) -> Self: ... - def top_k(self, k: int | IntoExprColumn = ...) -> Series: ... - def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: ... - def arg_sort(self, *, descending: bool = ..., nulls_last: bool = ...) -> Series: ... - def arg_unique(self) -> Series: ... - def arg_min(self) -> int | None: ... - def arg_max(self) -> int | None: ... - @overload - def search_sorted(self, element: int | float, side: SearchSortedSide = ...) -> int: ... - @overload - def search_sorted(self, element: Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> Series: ... - def unique(self, *, maintain_order: bool = ...) -> Series: ... - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: ... - def null_count(self) -> int: ... - def has_validity(self) -> bool: ... - def is_empty(self) -> bool: ... - def is_sorted(self, *, descending: bool = ...) -> bool: ... - def not_(self) -> Series: ... - def is_null(self) -> Series: ... - def is_not_null(self) -> Series: ... - def is_finite(self) -> Series: ... - def is_infinite(self) -> Series: ... - def is_nan(self) -> Series: ... - def is_not_nan(self) -> Series: ... - def is_in(self, other: Series | Collection[Any]) -> Series: ... - def arg_true(self) -> Series: ... - def is_unique(self) -> Series: ... - def is_first_distinct(self) -> Series: ... - def is_last_distinct(self) -> Series: ... - def is_duplicated(self) -> Series: ... - def explode(self) -> Series: ... - def series_equal(self, other: Series, *, null_equal: bool = ..., strict: bool = ...) -> bool: ... - def len(self) -> int: ... - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool], *, strict: bool = ...) -> Self: ... - def to_physical(self) -> Series: ... - def to_list(self, *, use_pyarrow: bool = ...) -> list[Any]: ... - def rechunk(self, *, in_place: bool = ...) -> Self: ... - def reverse(self) -> Series: ... - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: ... - def is_numeric(self) -> bool: ... - def is_integer(self, signed: bool | None = ...) -> bool: ... - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: ... - def is_float(self) -> bool: ... - def is_boolean(self) -> bool: ... - def is_utf8(self) -> bool: ... - def view(self, *, ignore_nulls: bool = ...) -> SeriesView: ... - def to_numpy(self, *args: Any, zero_copy_only: bool = ..., writable: bool = ..., use_pyarrow: bool = ...) -> np.ndarray[Any, Any]: ... - def to_arrow(self) -> pa.Array: ... - def to_pandas(self, *args: Any, use_pyarrow_extension_array: bool = ..., **kwargs: Any) -> pd.Series[Any]: ... - def to_init_repr(self, n: int = ...) -> str: ... - def set(self, filter: Series, value: int | float | str) -> Series: ... - def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: ... - def clear(self, n: int = ...) -> Series: ... - def clone(self) -> Self: ... - def fill_nan(self, value: int | float | Expr | None) -> Series: ... - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) 
-> Series: ... - def floor(self) -> Series: ... - def ceil(self) -> Series: ... - def round(self, decimals: int = ...) -> Series: ... - def dot(self, other: Series | ArrayLike) -> float | None: ... - def mode(self) -> Series: ... - def sign(self) -> Series: ... - def sin(self) -> Series: ... - def cos(self) -> Series: ... - def tan(self) -> Series: ... - def arcsin(self) -> Series: ... - def arccos(self) -> Series: ... - def arctan(self) -> Series: ... - def arcsinh(self) -> Series: ... - def arccosh(self) -> Series: ... - def arctanh(self) -> Series: ... - def sinh(self) -> Series: ... - def cosh(self) -> Series: ... - def tanh(self) -> Series: ... - def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def shift(self, periods: int = ...) -> Series: ... - def shift_and_fill(self, fill_value: int | Expr, *, periods: int = ...) -> Series: ... - def zip_with(self, mask: Series, other: Series) -> Self: ... - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ..., ddof: int = ...) -> Series: ... - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def rolling_skew(self, window_size: int, *, bias: bool = ...) -> Series: ... - def sample(self, n: int | None = ..., *, fraction: float | None = ..., with_replacement: bool = ..., shuffle: bool = ..., seed: int | None = ...) -> Series: ... - def peak_max(self) -> Self: ... - def peak_min(self) -> Self: ... - def n_unique(self) -> int: ... - def shrink_to_fit(self, *, in_place: bool = ...) -> Series: ... - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: ... - def reinterpret(self, *, signed: bool = ...) -> Series: ... - def interpolate(self, method: InterpolationMethod = ...) -> Series: ... - def abs(self) -> Series: ... - def rank(self, method: RankMethod = ..., *, descending: bool = ..., seed: int | None = ...) -> Series: ... - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: ... - def pct_change(self, n: int = ...) -> Series: ... - def skew(self, *, bias: bool = ...) -> float | None: ... - def kurtosis(self, *, fisher: bool = ..., bias: bool = ...) -> float | None: ... 
- def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: ... - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: ... - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: ... - def lower_bound(self) -> Self: ... - def upper_bound(self) -> Self: ... - def map_dict(self, remapping: dict[Any, Any], *, default: Any = ..., return_dtype: PolarsDataType | None = ...) -> Self: ... - def reshape(self, dimensions: tuple[int, ...]) -> Series: ... - def shuffle(self, seed: int | None = ...) -> Series: ... - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ..., *, adjust: bool = ..., bias: bool = ..., min_periods: int = ..., ignore_nulls: bool = ...) -> Series: ... - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: ... - def set_sorted(self, *, descending: bool = ...) -> Self: ... - def new_from_index(self, index: int, length: int) -> Self: ... - def shrink_dtype(self) -> Series: ... - def get_chunks(self) -> list[Series]: ... - def implode(self) -> Self: ... - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ..., *, skip_nulls: bool = ...) -> Self: ... - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ..., *, center: bool = ...) -> Series: ... - def is_first(self) -> Series: ... - def is_last(self) -> Series: ... - @property - def bin(self) -> BinaryNameSpace: ... - @property - def cat(self) -> CatNameSpace: ... - @property - def dt(self) -> DateTimeNameSpace: ... - @property - def list(self) -> ListNameSpace: ... - @property - def arr(self) -> ArrayNameSpace: ... - @property - def str(self) -> StringNameSpace: ... - @property - def struct(self) -> StructNameSpace: ... - -def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: ... 
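Editorial note (not part of the generated diff): the hunk above removes the old extension-less polars/series/series stub for 0.19.7, and the hunk that follows adds its replacement, series.pyi, whose first line is the "#: version 0.19.7" marker. As a hedged illustration only, a consumer of these bundled stubs could resolve the file for an installed polars version and sanity-check that marker roughly as sketched below. The helper name, signature, and error handling are invented here for illustration and are not part of polugins; the only assumption taken from this diff is the _stubs/<version>/polars/<module>.pyi layout visible in the file paths.

    from __future__ import annotations

    from pathlib import Path


    def find_versioned_stub(
        stubs_root: Path, polars_version: str, module: str = "series/series"
    ) -> Path | None:
        # e.g. _stubs/0.19.7/polars/series/series.pyi
        candidate = stubs_root / polars_version / "polars" / (module + ".pyi")
        if not candidate.exists():
            return None
        first_line = candidate.read_text().splitlines()[0]
        # Generated stubs begin with a "#: version X.Y.Z" marker line.
        if first_line != f"#: version {polars_version}":
            raise ValueError(f"unexpected version marker in {candidate}: {first_line!r}")
        return candidate

For example, find_versioned_stub(Path("src/polugins_type_gen/_stubs"), "0.19.7") would return the series.pyi file added in the next hunk, provided the marker on its first line matches the requested version.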
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/series/series.pyi new file mode 100644 index 0000000..64e9437 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.19.7/polars/series/series.pyi @@ -0,0 +1,4713 @@ +#: version 0.19.7 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, FLOAT_DTYPES as FLOAT_DTYPES, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, SIGNED_INTEGER_DTYPES as SIGNED_INTEGER_DTYPES, TEMPORAL_DTYPES as TEMPORAL_DTYPES, Time as Time, UInt32 as UInt32, UInt64 as UInt64, UNSIGNED_INTEGER_DTYPES as UNSIGNED_INTEGER_DTYPES, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time +from polars.utils.deprecation import deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... 
+ _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the ``Series`` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series <= other``.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series < other``.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series == other``.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series == other`` where `None` == None`. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series != other``.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator ``series != other`` where `None` == None`. + + This differs from the standard ``ne`` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series >= other``.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression ``series > other``.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to ``s[0]``, with a check + that the shape is (1,). With an index, this is equivalent to ``s[index]``. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cumsum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) 
-> int | float:
+ '''
+ Return an estimation of the total (heap) allocated size of the Series.
+
+ Estimated size is given in the specified unit (bytes by default).
+
+ This estimation is the sum of the size of its buffers, validity, including
+ nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
+ size of 2 arrays is not the sum of the sizes computed from this function. In
+ particular, [`StructArray`]\'s size is an upper bound.
+
+ When an array is sliced, its allocated size remains constant because the buffer
+ is unchanged. However, this function will yield a smaller number. This is because
+ this function returns the visible size of the buffer, not its total capacity.
+
+ FFI buffers are included in this estimation.
+
+ Parameters
+ ----------
+ unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'}
+ Scale the returned size to the given unit.
+
+ Examples
+ --------
+ >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32)
+ >>> s.estimated_size()
+ 4000000
+ >>> s.estimated_size("mb")
+ 3.814697265625
+
+ '''
+ def sqrt(self) -> Series:
+ """
+ Compute the square root of the elements.
+
+ Syntactic sugar for
+
+ >>> pl.Series([1, 2]) ** 0.5
+ shape: (2,)
+ Series: '' [f64]
+ [
+ 1.0
+ 1.414214
+ ]
+
+ """
+ def cbrt(self) -> Series:
+ """
+ Compute the cube root of the elements.
+
+ Optimization for
+
+ >>> pl.Series([1, 2]) ** (1.0 / 3)
+ shape: (2,)
+ Series: '' [f64]
+ [
+ 1.0
+ 1.259921
+ ]
+
+ """
+ def any(self) -> bool | None:
+ """
+ Return whether any of the values in the column are ``True``.
+
+ Only works on columns of data type :class:`Boolean`.
+
+ Parameters
+ ----------
+ ignore_nulls
+ Ignore null values (default).
+
+ If set to ``False``, `Kleene logic`_ is used to deal with nulls:
+ if the column contains any null values and no ``True`` values,
+ the output is ``None``.
+
+ .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic
+
+ Returns
+ -------
+ bool or None
+
+ Examples
+ --------
+ >>> pl.Series([True, False]).any()
+ True
+ >>> pl.Series([False, False]).any()
+ False
+ >>> pl.Series([None, False]).any()
+ False
+
+ Enable Kleene logic by setting ``ignore_nulls=False``.
+
+ >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None
+
+ """
+ def all(self) -> bool | None:
+ """
+ Return whether all values in the column are ``True``.
+
+ Only works on columns of data type :class:`Boolean`.
+
+ Parameters
+ ----------
+ ignore_nulls
+ Ignore null values (default).
+
+ If set to ``False``, `Kleene logic`_ is used to deal with nulls:
+ if the column contains any null values and no ``False`` values,
+ the output is ``None``.
+
+ .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic
+
+ Returns
+ -------
+ bool or None
+
+ Examples
+ --------
+ >>> pl.Series([True, True]).all()
+ True
+ >>> pl.Series([False, True]).all()
+ False
+ >>> pl.Series([None, True]).all()
+ True
+
+ Enable Kleene logic by setting ``ignore_nulls=False``.
+
+ >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None
+
+ """
+ def log(self, base: float = ...) -> Series:
+ """Compute the logarithm to a given base."""
+ def log1p(self) -> Series:
+ """Compute the natural logarithm of the input array plus one, element-wise."""
+ def log10(self) -> Series:
+ """Compute the base 10 logarithm of the input array, element-wise."""
+ def exp(self) -> Series:
+ """Compute the exponential, element-wise."""
+ def drop_nulls(self) -> Series:
+ '''
+ Drop all null values.
+
+ The original order of the remaining elements is preserved.
+ + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + series has a numeric dtype). All values must be in the range `[0, 1]`. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> series_num = pl.Series([1, 2, 3, 4, 5]) + >>> series_num.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + >>> series_str = pl.Series(["a", "a", None, "b", "c"]) + >>> series_str.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 5 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. 
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.min()
+ 1
+
+ '''
+ def max(self) -> PythonLiteral | None:
+ '''
+ Get the maximum value in this Series.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.max()
+ 3
+
+ '''
+ def nan_max(self) -> int | float | date | datetime | timedelta | str:
+ """
+ Get maximum value, but propagate/poison encountered NaN values.
+
+ This differs from numpy's `nanmax` as numpy defaults to propagating NaN values,
+ whereas polars defaults to ignoring them.
+
+ """
+ def nan_min(self) -> int | float | date | datetime | timedelta | str:
+ """
+ Get minimum value, but propagate/poison encountered NaN values.
+
+ This differs from numpy's `nanmin` as numpy defaults to propagating NaN values,
+ whereas polars defaults to ignoring them.
+
+ """
+ def std(self, ddof: int = ...) -> float | None:
+ '''
+ Get the standard deviation of this Series.
+
+ Parameters
+ ----------
+ ddof
+ “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
+ where N represents the number of elements.
+ By default ddof is 1.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.std()
+ 1.0
+
+ '''
+ def var(self, ddof: int = ...) -> float | None:
+ '''
+ Get variance of this Series.
+
+ Parameters
+ ----------
+ ddof
+ “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
+ where N represents the number of elements.
+ By default ddof is 1.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.var()
+ 1.0
+
+ '''
+ def median(self) -> float | None:
+ '''
+ Get the median of this Series.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.median()
+ 2.0
+
+ '''
+ def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None:
+ '''
+ Get the quantile value of this Series.
+
+ Parameters
+ ----------
+ quantile
+ Quantile between 0.0 and 1.0.
+ interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'}
+ Interpolation method.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.quantile(0.5)
+ 2.0
+
+ '''
+ def to_dummies(self, separator: str = ...) -> DataFrame:
+ '''
+ Get dummy/indicator variables.
+
+ Parameters
+ ----------
+ separator
+ Separator/delimiter used when generating column names.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.to_dummies()
+ shape: (3, 3)
+ ┌─────┬─────┬─────┐
+ │ a_1 ┆ a_2 ┆ a_3 │
+ │ --- ┆ --- ┆ --- │
+ │ u8 ┆ u8 ┆ u8 │
+ ╞═════╪═════╪═════╡
+ │ 1 ┆ 0 ┆ 0 │
+ │ 0 ┆ 1 ┆ 0 │
+ │ 0 ┆ 0 ┆ 1 │
+ └─────┴─────┴─────┘
+
+ '''
+ def cut(self, breaks: Sequence[float]) -> Series | DataFrame:
+ '''
+ Bin continuous values into discrete categories.
+
+ Parameters
+ ----------
+ breaks
+ List of unique cut points.
+ labels
+ Names of the categories. The number of labels must be equal to the number
+ of cut points plus one.
+ break_point_label
+ Name of the breakpoint column. Only used if ``include_breaks`` is set to
+ ``True``.
+
+ .. deprecated:: 0.19.0
+ This parameter will be removed. Use ``Series.struct.rename_fields`` to
+ rename the field instead.
+ category_label
+ Name of the category column. Only used if ``include_breaks`` is set to
+ ``True``.
+
+ .. deprecated:: 0.19.0
+ This parameter will be removed. Use ``Series.struct.rename_fields`` to
+ rename the field instead.
+ left_closed
+ Set the intervals to be left-closed instead of right-closed.
+ include_breaks
+ Include a column with the right endpoint of the bin each observation falls
+ in.
This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to ``True``, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + category_label + Name of the category column. Only used if ``include_breaks`` is set to + ``True``. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use ``Series.struct.rename_fields`` to + rename the field instead. + as_series + If set to ``False``, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting ``include_breaks=True``, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if ``include_breaks`` is set to + ``False`` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. + + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬─────────┐ + │ break_point ┆ category ┆ a_count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═════════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴─────────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to ``False`` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. 
+
+ Returns
+ -------
+ DataFrame
+ Mapping of unique values to their count.
+
+ Examples
+ --------
+ >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"])
+ >>> s.value_counts() # doctest: +IGNORE_RESULT
+ shape: (3, 2)
+ ┌───────┬────────┐
+ │ color ┆ counts │
+ │ --- ┆ --- │
+ │ str ┆ u32 │
+ ╞═══════╪════════╡
+ │ red ┆ 2 │
+ │ green ┆ 1 │
+ │ blue ┆ 3 │
+ └───────┴────────┘
+
+ Sort the output by count.
+
+ >>> s.value_counts(sort=True)
+ shape: (3, 2)
+ ┌───────┬────────┐
+ │ color ┆ counts │
+ │ --- ┆ --- │
+ │ str ┆ u32 │
+ ╞═══════╪════════╡
+ │ blue ┆ 3 │
+ │ red ┆ 2 │
+ │ green ┆ 1 │
+ └───────┴────────┘
+
+ '''
+ def unique_counts(self) -> Series:
+ '''
+ Return a count of the unique values in the order of appearance.
+
+ Examples
+ --------
+ >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"])
+ >>> s.unique_counts()
+ shape: (3,)
+ Series: \'id\' [u32]
+ [
+ 1
+ 2
+ 3
+ ]
+
+ '''
+ def entropy(self, base: float = ...) -> float | None:
+ """
+ Computes the entropy.
+
+ Uses the formula ``-sum(pk * log(pk))`` where ``pk`` are discrete probabilities.
+
+ Parameters
+ ----------
+ base
+ Given base, defaults to `e`
+ normalize
+ Normalize pk if it doesn't sum to 1.
+
+ Examples
+ --------
+ >>> a = pl.Series([0.99, 0.005, 0.005])
+ >>> a.entropy(normalize=True)
+ 0.06293300616044681
+ >>> b = pl.Series([0.65, 0.10, 0.25])
+ >>> b.entropy(normalize=True)
+ 0.8568409950394724
+
+ """
+ def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series:
+ '''
+ Run an expression over a sliding window that increases `1` slot every iteration.
+
+ Parameters
+ ----------
+ expr
+ Expression to evaluate
+ min_periods
+ Number of valid values there should be in the window before the expression
+ is evaluated. valid values = `length - null_count`
+ parallel
+ Run in parallel. Don\'t do this in a group by or another operation that
+ already has much parallelization.
+
+ Warnings
+ --------
+ This functionality is experimental and may change without it being considered a
+ breaking change.
+
+ This can be really slow as it can have `O(n^2)` complexity. Don\'t use this
+ for operations that visit all elements.
+
+ Examples
+ --------
+ >>> s = pl.Series("values", [1, 2, 3, 4, 5])
+ >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2)
+ shape: (5,)
+ Series: \'values\' [f64]
+ [
+ 0.0
+ -3.0
+ -8.0
+ -15.0
+ -24.0
+ ]
+
+ '''
+ def alias(self, name: str) -> Series:
+ '''
+ Rename the series.
+
+ Parameters
+ ----------
+ name
+ The new name.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.alias("b")
+ shape: (3,)
+ Series: \'b\' [i64]
+ [
+ 1
+ 2
+ 3
+ ]
+
+ '''
+ def rename(self, name: str) -> Series:
+ '''
+ Rename this Series.
+
+ Alias for :func:`Series.alias`.
+
+ Parameters
+ ----------
+ name
+ New name.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s.rename("b")
+ shape: (3,)
+ Series: \'b\' [i64]
+ [
+ 1
+ 2
+ 3
+ ]
+
+ '''
+ def chunk_lengths(self) -> list[int]:
+ '''
+ Get the length of each individual chunk.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3])
+ >>> s2 = pl.Series("a", [4, 5, 6])
+
+ Concatenate Series with rechunk = True
+
+ >>> pl.concat([s, s2]).chunk_lengths()
+ [6]
+
+ Concatenate Series with rechunk = False
+
+ >>> pl.concat([s, s2], rechunk=False).chunk_lengths()
+ [3, 3]
+
+ '''
+ def n_chunks(self) -> int:
+ '''
+ Get the number of chunks that this Series contains.
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cummax(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cummax() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cummin(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cummin() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cumprod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumprod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cumsum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cumsum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to ``None``, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + Parameters + ---------- + other + Series to append. + append_chunks + .. deprecated:: 0.18.8 + This argument will be removed and ``append`` will change to always + behave like ``append_chunks=True`` (the previous default). For the + behavior of ``append_chunks=False``, use ``Series.extend``. + + If set to `True` the append operation will add the chunks from `other` to + self. This is super cheap. + + If set to `False` the append operation will do the same as + `DataFrame.extend` which extends the memory backed by this `Series` with + the values from `other`. + + Different from `append chunks`, `extend` appends the data from `other` to + the underlying memory locations and thus may cause a reallocation (which are + expensive). + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a + single append. For instance during online operations where you add `n` rows + and rerun a query. + + Prefer `append_chunks` over `extend` when you want to append many times + before doing a query. For instance when you read in multiple files and when + to store them in a single `Series`. 
In the latter case, finish the sequence + of `append_chunks` operations with a `rechunk`. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. + + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from ``append``, which adds the chunks from ``other`` to the chunks of + this series, ``extend`` appends the data from ``other`` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer ``extend`` over ``append`` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer ``append`` over ``extend`` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single ``Series``. In the latter case, finish the sequence + of ``append`` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last ``abs(n)``. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last ``abs(n)``. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first ``abs(n)``. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first ``abs(n)``. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. 
+
+ Alias for :func:`Series.head`.
+
+ Parameters
+ ----------
+ n
+ Number of elements to return. If a negative value is passed, return all
+ elements except the last ``abs(n)``.
+
+ See Also
+ --------
+ head
+
+ """
+ def take_every(self, n: int) -> Series:
+ '''
+ Take every nth value in the Series and return as new Series.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3, 4])
+ >>> s.take_every(2)
+ shape: (2,)
+ Series: \'a\' [i64]
+ [
+ 1
+ 3
+ ]
+
+ '''
+ def sort(self) -> Self:
+ '''
+ Sort this Series.
+
+ Parameters
+ ----------
+ descending
+ Sort in descending order.
+ in_place
+ Sort in-place.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 3, 4, 2])
+ >>> s.sort()
+ shape: (4,)
+ Series: \'a\' [i64]
+ [
+ 1
+ 2
+ 3
+ 4
+ ]
+ >>> s.sort(descending=True)
+ shape: (4,)
+ Series: \'a\' [i64]
+ [
+ 4
+ 3
+ 2
+ 1
+ ]
+
+ '''
+ def top_k(self, k: int | IntoExprColumn = ...) -> Series:
+ '''
+ Return the `k` largest elements.
+
+ This has time complexity:
+
+ .. math:: O(n + k \\\\log{}n - \\frac{k}{2})
+
+ Parameters
+ ----------
+ k
+ Number of elements to return.
+
+ See Also
+ --------
+ bottom_k
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [2, 5, 1, 4, 3])
+ >>> s.top_k(3)
+ shape: (3,)
+ Series: \'a\' [i64]
+ [
+ 5
+ 4
+ 3
+ ]
+
+ '''
+ def bottom_k(self, k: int | IntoExprColumn = ...) -> Series:
+ '''
+ Return the `k` smallest elements.
+
+ This has time complexity:
+
+ .. math:: O(n + k \\\\log{}n - \\frac{k}{2})
+
+ Parameters
+ ----------
+ k
+ Number of elements to return.
+
+ See Also
+ --------
+ top_k
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [2, 5, 1, 4, 3])
+ >>> s.bottom_k(3)
+ shape: (3,)
+ Series: \'a\' [i64]
+ [
+ 1
+ 2
+ 3
+ ]
+
+ '''
+ def arg_sort(self) -> Series:
+ '''
+ Get the index values that would sort this Series.
+
+ Parameters
+ ----------
+ descending
+ Sort in descending order.
+ nulls_last
+ Place null values last instead of first.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [5, 3, 4, 1, 2])
+ >>> s.arg_sort()
+ shape: (5,)
+ Series: \'a\' [u32]
+ [
+ 3
+ 4
+ 1
+ 2
+ 0
+ ]
+
+ '''
+ def arg_unique(self) -> Series:
+ '''
+ Get unique index as Series.
+
+ Returns
+ -------
+ Series
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 2, 3])
+ >>> s.arg_unique()
+ shape: (3,)
+ Series: \'a\' [u32]
+ [
+ 0
+ 1
+ 3
+ ]
+
+ '''
+ def arg_min(self) -> int | None:
+ '''
+ Get the index of the minimal value.
+
+ Returns
+ -------
+ int
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [3, 2, 1])
+ >>> s.arg_min()
+ 2
+
+ '''
+ def arg_max(self) -> int | None:
+ '''
+ Get the index of the maximal value.
+
+ Returns
+ -------
+ int
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [3, 2, 1])
+ >>> s.arg_max()
+ 0
+
+ '''
+ def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series:
+ """
+ Find indices where elements should be inserted to maintain order.
+
+ .. math:: a[i-1] < v <= a[i]
+
+ Parameters
+ ----------
+ element
+ Expression or scalar value.
+ side : {'any', 'left', 'right'}
+ If 'any', the index of the first suitable location found is given.
+ If 'left', the index of the leftmost suitable location found is given.
+ If 'right', the index of the rightmost suitable location found is given.
+
+ """
+ def unique(self) -> Series:
+ '''
+ Get unique elements in series.
+
+ Parameters
+ ----------
+ maintain_order
+ Maintain order of data. This requires more work.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 2, 3])
+ >>> s.unique().sort()
+ shape: (3,)
+ Series: \'a\' [i64]
+ [
+ 1
+ 2
+ 3
+ ]
+
+ '''
+ def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series:
+ '''
+ Take values by index.
+
+ Parameters
+ ----------
+ indices
+ Index location used for selection.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1, 2, 3, 4])
+ >>> s.take([1, 3])
+ shape: (2,)
+ Series: \'a\' [i64]
+ [
+ 2
+ 4
+ ]
+
+ '''
+ def null_count(self) -> int:
+ """Count the null values in this Series."""
+ def has_validity(self) -> bool:
+ """
+ Return True if the Series has a validity bitmask.
+
+ If there is no mask, it means that there are no ``null`` values.
+
+ Notes
+ -----
+ While the *absence* of a validity bitmask guarantees that a Series does not
+ have ``null`` values, the converse is not true, eg: the *presence* of a
+ bitmask does not mean that there are null values, as every value of the
+ bitmask could be ``false``.
+
+ To confirm that a column has ``null`` values use :func:`null_count`.
+
+ """
+ def is_empty(self) -> bool:
+ '''
+ Check if the Series is empty.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [], dtype=pl.Float32)
+ >>> s.is_empty()
+ True
+
+ '''
+ def is_sorted(self) -> bool:
+ """
+ Check if the Series is sorted.
+
+ Parameters
+ ----------
+ descending
+ Check if the Series is sorted in descending order
+
+ """
+ def not_(self) -> Series:
+ '''
+ Negate a boolean Series.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [True, False, False])
+ >>> s.not_()
+ shape: (3,)
+ Series: \'a\' [bool]
+ [
+ false
+ true
+ true
+ ]
+
+ '''
+ def is_null(self) -> Series:
+ '''
+ Returns a boolean Series indicating which values are null.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1.0, 2.0, 3.0, None])
+ >>> s.is_null()
+ shape: (4,)
+ Series: \'a\' [bool]
+ [
+ false
+ false
+ false
+ true
+ ]
+
+ '''
+ def is_not_null(self) -> Series:
+ '''
+ Returns a boolean Series indicating which values are not null.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1.0, 2.0, 3.0, None])
+ >>> s.is_not_null()
+ shape: (4,)
+ Series: \'a\' [bool]
+ [
+ true
+ true
+ true
+ false
+ ]
+
+ '''
+ def is_finite(self) -> Series:
+ '''
+ Returns a boolean Series indicating which values are finite.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> s = pl.Series("a", [1.0, 2.0, np.inf])
+ >>> s.is_finite()
+ shape: (3,)
+ Series: \'a\' [bool]
+ [
+ true
+ true
+ false
+ ]
+
+ '''
+ def is_infinite(self) -> Series:
+ '''
+ Returns a boolean Series indicating which values are infinite.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> s = pl.Series("a", [1.0, 2.0, np.inf])
+ >>> s.is_infinite()
+ shape: (3,)
+ Series: \'a\' [bool]
+ [
+ false
+ false
+ true
+ ]
+
+ '''
+ def is_nan(self) -> Series:
+ '''
+ Returns a boolean Series indicating which values are NaN.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.NaN]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. 
+ + """ + def series_equal(self, other: Series) -> bool: + ''' + Check if series is equal with another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s.series_equal(s) + True + >>> s.series_equal(s2) + False + + ''' + def len(self) -> int: + ''' + Length of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.len() + 3 + + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - ``List(inner)`` -> ``List(physical of inner)`` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the ``closed`` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() + True + >>> s.is_integer(signed=False) + True + >>> s.is_integer(signed=True) + False + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() + True + >>> s.is_temporal(excluding=[pl.Date]) + False + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() + True + + ''' + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() + True + + ''' + def view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + Don\'t use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s.view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point ``nan`` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set ``zero_copy_only=True``. + + Alternatively, if you want a zero-copy view and know what you are doing, + use `.view()`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. 
+ zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def set(self, filter: Series, value: int | float | str) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... 
) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def set_at_idx(self, idx: Series | np.ndarray[Any, Any] | Sequence[int] | int, value: int | float | str | bool | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | date | datetime | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + idx + Integers representing the index locations. + value + replacement values. + + Returns + ------- + Series + The mutated series. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_at_idx(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Very cheap deepcopy/clone. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always _significantly_ + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If ``return_dtype`` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an ``@lru_cache`` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, periods: int = ...) -> Series: + ''' + Shift the values by a given period. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shift(periods=1) + shape: (3,) + Series: \'a\' [i64] + [ + null + 1 + 2 + ] + >>> s.shift(periods=-1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 3 + null + ] + + Parameters + ---------- + periods + Number of places to shift (may be negative). + + ''' + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift the values by a given period and fill the resulting null values. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + periods + Number of places to shift (may be negative). + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the ``window_size - 1`` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. 
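The `window_size`, `weights` and `min_periods` parameters repeated across the rolling methods above can be illustrated with a plain-Python sketch of a weighted rolling sum. This is an editorial simplification (nulls are ignored and `min_periods` is left at its documented default of `window_size`), not the library's implementation; `rolling_weighted_sum` is a hypothetical helper.

    def rolling_weighted_sum(values, window_size, weights=None):
        # weights default to 1.0 for every slot in the window
        weights = weights or [1.0] * window_size
        out = []
        for i, _ in enumerate(values):
            if i + 1 < window_size:
                # not enough values yet: the documented default yields null here
                out.append(None)
            else:
                window = values[i - window_size + 1 : i + 1]
                out.append(sum(w * v for w, v in zip(weights, window)))
        return out

    rolling_weighted_sum([1, 2, 3, 4, 5], window_size=2)
    # [None, 3, 5, 7, 9], matching the rolling_sum(window_size=2) example above
    rolling_weighted_sum([1, 2, 3, 4, 5], 2, weights=[0.5, 0.5])
    # [None, 1.5, 2.5, 3.5, 4.5]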
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the n-th discrete difference. 
+ + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least ``n`` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If ``bias`` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + ''' + Clip (limit) the values in an array to a `min` and `max` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. 
See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Minimum value. + upper_bound + Maximum value. + + Examples + -------- + >>> s = pl.Series("foo", [-50, 5, None, 50]) + >>> s.clip(1, 10) + shape: (4,) + Series: \'foo\' [i64] + [ + 1 + 5 + null + 10 + ] + + ''' + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + ''' + Clip (limit) the values in an array to a `min` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + lower_bound + Lower bound. + + ''' + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + ''' + Clip (limit) the values in an array to a `max` boundary. + + Only works for physical numerical types. + + If you want to clip other dtypes, consider writing a "when, then, otherwise" + expression. See :func:`when` for more information. + + Parameters + ---------- + upper_bound + Upper bound. + + ''' + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def map_dict(self, remapping: dict[Any, Any]) -> Self: + ''' + Replace values in the Series using a remapping dictionary. + + Parameters + ---------- + remapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use ``pl.first()``, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + Examples + -------- + >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"]) + >>> country_lookup = { + ... "JPN": "Japan", + ... "TUR": "Türkiye", + ... "NLD": "Netherlands", + ... } + + Remap, setting a default for unrecognised values... + + >>> s.map_dict(country_lookup, default="Unspecified").alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "Unspecified" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by making use of ``pl.first()``: + + >>> s.map_dict(country_lookup, default=pl.first()).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" + "Japan" + "Netherlands" + ] + + ...or keep the original value, by assigning the input series: + + >>> s.map_dict(country_lookup, default=s).alias("country_name") + shape: (4,) + Series: \'country_name\' [str] + [ + "Türkiye" + "???" 
+ "Japan" + "Netherlands" + ] + + Override return dtype: + + >>> s = pl.Series("int8", [5, 2, 3], dtype=pl.Int8) + >>> s.map_dict({2: 7}, default=pl.first(), return_dtype=pl.Int16) + shape: (3,) + Series: \'int8\' [i16] + [ + 5 + 7 + 3 + ] + + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. 
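The decay parametrisations documented for `ewm_mean` (and for the `ewm_std`/`ewm_var` methods that follow) can be tied together with a short sketch. This is an editorial illustration built from the formulas given above; `alpha_from` and `ewm_mean_unadjusted` are hypothetical helpers, not library functions.

    import math

    def alpha_from(com=None, span=None, half_life=None, alpha=None):
        # the alternative parametrisations reduce to a single smoothing factor
        if com is not None:
            return 1.0 / (1.0 + com)                  # alpha = 1 / (1 + gamma)
        if span is not None:
            return 2.0 / (span + 1.0)                 # alpha = 2 / (theta + 1)
        if half_life is not None:
            return 1.0 - math.exp(-math.log(2.0) / half_life)
        return alpha

    def ewm_mean_unadjusted(values, alpha):
        # adjust=False recursion: y_t = (1 - alpha) * y_{t-1} + alpha * x_t
        y = values[0]
        out = [y]
        for x in values[1:]:
            y = (1.0 - alpha) * y + alpha * x
            out.append(y)
        return out

    alpha_from(com=1)                          # 0.5; span=3 gives the same factor
    ewm_mean_unadjusted([1.0, 2.0, 3.0], 0.5)  # [1.0, 1.5, 2.25]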
+ + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When ``adjust=True`` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When ``adjust=False`` the EW function is calculated + recursively by + + .. 
math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When ``bias=False``, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When ``ignore_nulls=False`` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if ``adjust=False``. + + - When ``ignore_nulls=True``, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if ``adjust=True``, + and :math:`1-\\alpha` and :math:`\\alpha` if ``adjust=False``. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. 
deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _resolve_datetime_dtype(dtype: PolarsDataType | None, ndtype: np.datetime64) -> PolarsDataType | None: + """Given polars/numpy datetime dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/dataframe/frame deleted file mode 100644 index 562effd..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/dataframe/frame +++ /dev/null @@ -1,6977 +0,0 @@ -import P -import deltalake -import np as np -import pa as pa -import pd as pd -from _io import BytesIO, TextIOWrapper - -from builtins import PyDataFrame -from pathlib import Path -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes.classes import Boolean as Boolean, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 -from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.col import col as col -from polars.functions.lit import lit as lit -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, 
_xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence - -TYPE_CHECKING: bool -INTEGER_DTYPES: frozenset -N_INFER_DEFAULT: int -_PYARROW_AVAILABLE: bool -_dtype_str_repr: builtin_function_or_method - -class DataFrame: - _accessors: _ClassVar[set] = ... - columns: Incomplete - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: - """Construct Polars DataFrame from FFI PyDataFrame object.""" - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from a dictionary of sequences. - - Parameters - ---------- - data : dict of sequences - Two-dimensional data represented as a dictionary. dict must contain - Sequences. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. 
- schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - - """ - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from a sequence of sequences. - - Parameters - ---------- - data : Sequence of sequences - Two-dimensional data represented as a sequence of sequences. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - orient : {'col', 'row'}, default None - Whether to interpret two-dimensional data as columns or as rows. If None, - the orientation is inferred by matching the columns and data dimensions. If - this does not yield conclusive results, column orientation is used. - infer_schema_length - How many rows to scan to determine the column type. - - """ - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from a numpy ndarray. - - Parameters - ---------- - data : numpy ndarray - Two-dimensional data represented as a numpy ndarray. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - orient : {'col', 'row'}, default None - Whether to interpret two-dimensional data as columns or as rows. If None, - the orientation is inferred by matching the columns and data dimensions. If - this does not yield conclusive results, column orientation is used. - - """ - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from an Arrow table. - - This operation will be zero copy for the most part. Types that are not - supported by Polars may be cast to the closest supported type. - - Parameters - ---------- - data : arrow table, array, or sequence of sequences - Data representing an Arrow Table or Array. 
- schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - rechunk : bool, default True - Make sure that all data is in contiguous memory. - - """ - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a Polars DataFrame from a pandas DataFrame. - - Parameters - ---------- - data : pandas DataFrame - Two-dimensional data represented as a pandas DataFrame. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - rechunk : bool, default True - Make sure that all data is in contiguous memory. - nan_to_null : bool, default True - If the data contains NaN values they will be converted to null/None. - include_index : bool, default False - Load any non-default pandas indexes as columns. - - """ - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: - """ - Read a CSV file into a DataFrame. - - Use `pl.read_csv` to dispatch to this method. - - See Also - -------- - polars.io.read_csv - - """ - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: - """ - Read into a DataFrame from a parquet file. - - Use `pl.read_parquet` to dispatch to this method. - - See Also - -------- - polars.io.read_parquet - - """ - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: - """ - Read into a DataFrame from Apache Avro format. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns. - n_rows - Stop reading from Apache Avro file after reading `n_rows`. - - """ - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: - ''' - Read into a DataFrame from Arrow IPC file format. - - See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. - Arrow IPC files are also known as Feather (v2) files. 
- - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns to select. Accepts a list of column indices (starting at zero) or a - list of column names. - n_rows - Stop reading from IPC file after reading `n_rows`. - row_count_name - Row count name. - row_count_offset - Row count offset. - rechunk - Make sure that all data is contiguous. - memory_map - Memory map the file - - ''' - @classmethod - def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: - ''' - Read into a DataFrame from Arrow IPC record batch stream format. - - See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns to select. Accepts a list of column indices (starting at zero) or a - list of column names. - n_rows - Stop reading from IPC stream after reading `n_rows`. - row_count_name - Row count name. - row_count_offset - Row count offset. - rechunk - Make sure that all data is contiguous. - - ''' - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: - """ - Read into a DataFrame from a JSON file. - - Use `pl.read_json` to dispatch to this method. - - See Also - -------- - polars.io.read_json - - """ - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: - """ - Read into a DataFrame from a newline delimited JSON file. - - Use `pl.read_ndjson` to dispatch to this method. - - See Also - -------- - polars.io.read_ndjson - - """ - def _replace(self, column: str, new_column: Series) -> Self: - """Replace a column by a new Series (in place).""" - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: - """ - Numpy __array__ interface protocol. - - Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see - https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. - """ - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: - ''' - Convert to a dataframe object implementing the dataframe interchange protocol. - - Parameters - ---------- - nan_as_null - Overwrite null values in the data with `NaN`. - - .. warning:: - This functionality has not been implemented and the parameter will be - removed in a future version. - Setting this to `True` will raise a `NotImplementedError`. - allow_copy - Allow memory to be copied to perform the conversion. If set to `False`, - causes conversions that are not zero-copy to fail. - - Notes - ----- - Details on the Python dataframe interchange protocol: - https://data-apis.org/dataframe-protocol/latest/index.html - - Examples - -------- - Convert a Polars DataFrame to a generic dataframe object and access some - properties. - - >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) - >>> dfi = df.__dataframe__() - >>> dfi.num_rows() - 2 - >>> dfi.get_column(1).dtype - (, 64, \'g\', \'=\') - - ''' - def __dataframe_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. 
- """ - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with another object.""" - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with another DataFrame.""" - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with a non-DataFrame object.""" - def _div(self, other: Any) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Series]: ... - def __reversed__(self) -> Iterator[Series]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... - def _take_with_series(self, s: Series) -> DataFrame: ... - def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: - """Get item. Does quite a lot. Read the comments.""" - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: - """ - Format output data in HTML for display in Jupyter Notebooks. - - Output rows and columns can be modified by setting the following ENVIRONMENT - variables: - - * POLARS_FMT_MAX_COLS: set the number of columns - * POLARS_FMT_MAX_ROWS: set the number of rows - - """ - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: - ''' - Return the DataFrame as a scalar, or return the element at the given row/column. - - Parameters - ---------- - row - Optional row index. - column - Optional column index or name. - - See Also - -------- - row: Get the values of a single row, either by index or by predicate. - - Notes - ----- - If row/col not provided, this is equivalent to `df[0,0]`, with a check that - the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> df.select((pl.col("a") * pl.col("b")).sum()).item() - 32 - >>> df.item(1, 1) - 5 - >>> df.item(2, "b") - 6 - - ''' - def to_arrow(self) -> pa.Table: - ''' - Collect the underlying arrow arrays in an Arrow Table. 
- - This operation is mostly zero copy. - - Data types that do copy: - - CategoricalType - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} - ... ) - >>> df.to_arrow() - pyarrow.Table - foo: int64 - bar: large_string - ---- - foo: [[1,2,3,4,5,6]] - bar: [["a","b","c","d","e","f"]] - - ''' - def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: - ''' - Convert DataFrame to a dictionary mapping column name to values. - - Parameters - ---------- - as_series - True -> Values are Series - False -> Values are List[Any] - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4, 5], - ... "fruits": ["banana", "banana", "apple", "apple", "banana"], - ... "B": [5, 4, 3, 2, 1], - ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], - ... "optional": [28, 300, None, 2, -30], - ... } - ... ) - >>> df - shape: (5, 5) - ┌─────┬────────┬─────┬────────┬──────────┐ - │ A ┆ fruits ┆ B ┆ cars ┆ optional │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ - ╞═════╪════════╪═════╪════════╪══════════╡ - │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ - │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ - │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ - │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ - │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ - └─────┴────────┴─────┴────────┴──────────┘ - >>> df.to_dict(as_series=False) - {\'A\': [1, 2, 3, 4, 5], - \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], - \'B\': [5, 4, 3, 2, 1], - \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], - \'optional\': [28, 300, None, 2, -30]} - >>> df.to_dict(as_series=True) - {\'A\': shape: (5,) - Series: \'A\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ], \'fruits\': shape: (5,) - Series: \'fruits\' [str] - [ - "banana" - "banana" - "apple" - "apple" - "banana" - ], \'B\': shape: (5,) - Series: \'B\' [i64] - [ - 5 - 4 - 3 - 2 - 1 - ], \'cars\': shape: (5,) - Series: \'cars\' [str] - [ - "beetle" - "audi" - "beetle" - "beetle" - "beetle" - ], \'optional\': shape: (5,) - Series: \'optional\' [i64] - [ - 28 - 300 - null - 2 - -30 - ]} - - ''' - def to_dicts(self) -> list[dict[str, Any]]: - ''' - Convert every row to a dictionary of Python-native values. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.to_dicts() - [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] - - ''' - def to_numpy(self) -> np.ndarray[Any, Any]: - ''' - Convert DataFrame to a 2D NumPy array. - - This operation clones data. - - Parameters - ---------- - structured - Optionally return a structured array, with field names and - dtypes that correspond to the DataFrame schema. - order - The index order of the returned NumPy array, either C-like or - Fortran-like. In general, using the Fortran-like index order is faster. - However, the C-like order might be more appropriate to use for downstream - applications to prevent cloning data, e.g. when reshaping into a - one-dimensional array. Note that this option only takes effect if - `structured` is set to `False` and the DataFrame dtypes allow for a - global dtype for all columns. 
- - Notes - ----- - If you\'re attempting to convert Utf8 to an array you\'ll need to install - `pyarrow`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.5, 7.0, 8.5], - ... "ham": ["a", "b", "c"], - ... }, - ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, - ... ) - - Export to a standard 2D numpy array. - - >>> df.to_numpy() - array([[1, 6.5, \'a\'], - [2, 7.0, \'b\'], - [3, 8.5, \'c\']], dtype=object) - - Export to a structured array, which can better-preserve individual - column data, such as name and dtype... - - >>> df.to_numpy(structured=True) - array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], - dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np - >>> df.to_numpy(structured=True).view(np.recarray) - rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], - dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: - ''' - Cast to a pandas DataFrame. - - This requires that :mod:`pandas` and :mod:`pyarrow` are installed. - This operation clones data, unless `use_pyarrow_extension_array=True`. - - Parameters - ---------- - use_pyarrow_extension_array - Use PyArrow backed-extension arrays instead of numpy arrays for each column - of the pandas DataFrame; this allows zero copy operations and preservation - of null values. Subsequent operations on the resulting pandas DataFrame may - trigger conversion to NumPy arrays if that operation is not supported by - pyarrow compute functions. - **kwargs - Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. - - Returns - ------- - :class:`pandas.DataFrame` - - Examples - -------- - >>> import pandas - >>> df1 = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> pandas_df1 = df1.to_pandas() - >>> type(pandas_df1) - - >>> pandas_df1.dtypes - foo int64 - bar int64 - ham object - dtype: object - >>> df2 = pl.DataFrame( - ... { - ... "foo": [1, 2, None], - ... "bar": [6, None, 8], - ... "ham": [None, "b", "c"], - ... } - ... ) - >>> pandas_df2 = df2.to_pandas() - >>> pandas_df2 - foo bar ham - 0 1.0 6.0 None - 1 2.0 NaN b - 2 NaN 8.0 c - >>> pandas_df2.dtypes - foo float64 - bar float64 - ham object - dtype: object - >>> pandas_df2_pa = df2.to_pandas( - ... use_pyarrow_extension_array=True - ... ) # doctest: +SKIP - >>> pandas_df2_pa # doctest: +SKIP - foo bar ham - 0 1 6 - 1 2 b - 2 8 c - >>> pandas_df2_pa.dtypes # doctest: +SKIP - foo int64[pyarrow] - bar int64[pyarrow] - ham large_string[pyarrow] - dtype: object - - ''' - def to_series(self, index: int = ...) -> Series: - ''' - Select column as Series at index location. - - Parameters - ---------- - index - Location of selection. - - See Also - -------- - get_column - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.to_series(1) - shape: (3,) - Series: \'bar\' [i64] - [ - 6 - 7 - 8 - ] - - ''' - def to_init_repr(self, n: int = ...) -> str: - ''' - Convert DataFrame to instantiatable string representation. - - Parameters - ---------- - n - Only use first n rows. - - See Also - -------- - polars.Series.to_init_repr - polars.from_repr - - Examples - -------- - >>> df = pl.DataFrame( - ... [ - ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), - ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), - ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), - ... ] - ... 
) - >>> print(df.to_init_repr()) - pl.DataFrame( - [ - pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), - pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), - pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), - ] - ) - - >>> df_from_str_repr = eval(df.to_init_repr()) - >>> df_from_str_repr - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ f32 ┆ cat │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 7.0 ┆ b │ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: - ''' - Serialize to JSON representation. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - If set to `None` (default), the output is returned as a string instead. - pretty - Pretty serialize json. - row_oriented - Write to row oriented json. This is slower, but more common. - - See Also - -------- - DataFrame.write_ndjson - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... } - ... ) - >>> df.write_json() - \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' - >>> df.write_json(row_oriented=True) - \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' - - ''' - def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: - ''' - Serialize to newline delimited JSON representation. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - If set to `None` (default), the output is returned as a string instead. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... } - ... ) - >>> df.write_ndjson() - \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' - - ''' - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: - ''' - Write to comma-separated values (CSV) file. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - If set to `None` (default), the output is returned as a string instead. - include_bom - Whether to include UTF-8 BOM in the CSV output. - include_header - Whether to include header in the CSV output. - separator - Separate CSV fields with this symbol. - line_terminator - String used to end each row. - quote_char - Byte to use as quoting character. - batch_size - Number of rows that will be processed per thread. - datetime_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. If no format specified, the default fractional-second - precision is inferred from the maximum timeunit found in the frame\'s - Datetime cols (if any). - date_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. - time_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. - float_precision - Number of decimal places to write, applied to both `Float32` and - `Float64` datatypes. - null_value - A string representing null values (defaulting to the empty string). - quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} - Determines the quoting strategy used. - - - necessary (default): This puts quotes around fields only when necessary. - They are necessary when fields contain a quote, - separator or record terminator. 
- Quotes are also necessary when writing an empty record - (which is indistinguishable from a record with one empty field). - This is the default. - - always: This puts quotes around every field. Always. - - never: This never puts quotes around fields, even if that results in - invalid CSV data (e.g.: by not quoting strings containing the separator). - - non_numeric: This puts quotes around all fields that are non-numeric. - Namely, when writing a field that does not parse as a valid float - or integer, then quotes will be used even if they aren`t strictly - necessary. - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.csv" - >>> df.write_csv(path, separator=",") - - ''' - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: - ''' - Write to Apache Avro file. - - Parameters - ---------- - file - File path or writeable file-like object to which the data will be written. - compression : {\'uncompressed\', \'snappy\', \'deflate\'} - Compression method. Defaults to "uncompressed". - name - Schema name. Defaults to empty string. - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.avro" - >>> df.write_avro(path) - - ''' - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: - ''' - Write frame data to a table in an Excel workbook/worksheet. - - Parameters - ---------- - workbook : Workbook - String name or path of the workbook to create, BytesIO object to write - into, or an open `xlsxwriter.Workbook` object that has not been closed. - If None, writes to a `dataframe.xlsx` workbook in the working directory. - worksheet : str - Name of target worksheet; if None, writes to "Sheet1" when creating a new - workbook (note that writing to an existing workbook requires a valid - existing -or new- worksheet name). - position : {str, tuple} - Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. - table_style : {str, dict} - A named Excel table style, such as "Table Style Medium 4", or a dictionary - of `{"key":value,}` options containing one or more of the following keys: - "style", "first_column", "last_column", "banded_columns, "banded_rows". - table_name : str - Name of the output table object in the worksheet; can then be referred to - in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. - column_formats : dict - A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an - Excel format string to the given columns. Formats defined here (such as - "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. - dtype_formats : dict - A `{dtype:str,}` dictionary that sets the default Excel format for the - given dtype. (This can be overridden on a per-column basis by the - `column_formats` param). It is also valid to use dtype groups such as - `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform - integer and float formats. - conditional_formats : dict - A dictionary of colname (or selector) keys to a format str, dict, or list - that defines conditional formatting options for the specified columns. 
- - * If supplying a string typename, should be one of the valid `xlsxwriter` - types such as "3_color_scale", "data_bar", etc. - * If supplying a dictionary you can make use of any/all `xlsxwriter` - supported options, including icon sets, formulae, etc. - * Supplying multiple columns as a tuple/key will apply a single format - across all columns - this is effective in creating a heatmap, as the - min/max values will be determined across the entire range, not per-column. - * Finally, you can also supply a list made up from the above options - in order to apply *more* than one conditional format to the same range. - header_format : dict - A `{key:value,}` dictionary of `xlsxwriter` format options to apply - to the table header row, such as `{"bold":True, "font_color":"#702963"}`. - column_totals : {bool, list, dict} - Add a column-total row to the exported table. - - * If True, all numeric columns will have an associated total using "sum". - * If passing a string, it must be one of the valid total function names - and all numeric columns will have an associated total using that function. - * If passing a list of colnames, only those given will have a total. - * For more control, pass a `{colname:funcname,}` dict. - - Valid total function names are "average", "count_nums", "count", "max", - "min", "std_dev", "sum", and "var". - column_widths : {dict, int} - A `{colname:int,}` or `{selector:int,}` dict or a single integer that - sets (or overrides if autofitting) table column widths, in integer pixel - units. If given as an integer the same value is used for all table columns. - row_totals : {dict, bool} - Add a row-total column to the right-hand side of the exported table. - - * If True, a column called "total" will be added at the end of the table - that applies a "sum" function row-wise across all numeric columns. - * If passing a list/sequence of column names, only the matching columns - will participate in the sum. - * Can also pass a `{colname:columns,}` dictionary to create one or - more total columns with distinct names, referencing different columns. - row_heights : {dict, int} - An int or `{row_index:int,}` dictionary that sets the height of the given - rows (if providing a dictionary) or all rows (if providing an integer) that - intersect with the table body (including any header and total row) in - integer pixel units. Note that `row_index` starts at zero and will be - the header row (unless `include_header` is False). - sparklines : dict - A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more - sparklines to be written into a new column in the table. - - * If passing a list of colnames (used as the source of the sparkline data) - the default sparkline settings are used (eg: line chart with no markers). - * For more control an `xlsxwriter`-compliant options dict can be supplied, - in which case three additional polars-specific keys are available: - "columns", "insert_before", and "insert_after". These allow you to define - the source columns and position the sparkline(s) with respect to other - table columns. If no position directive is given, sparklines are added to - the end of the table (eg: to the far right) in the order they are given. - formulas : dict - A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or - more formulas to be written into a new column in the table. Note that you - are strongly advised to use structured references in your formulae wherever - possible to make it simple to reference columns by name. 
- - * If providing a string formula (such as "=[@colx]*[@coly]") the column will - be added to the end of the table (eg: to the far right), after any default - sparklines and before any row_totals. - * For the most control supply an options dictionary with the following keys: - "formula" (mandatory), one of "insert_before" or "insert_after", and - optionally "return_dtype". The latter is used to appropriately format the - output of the formula and allow it to participate in row/column totals. - float_precision : int - Default number of decimals displayed for floating point columns (note that - this is purely a formatting directive; the actual values are not rounded). - include_header : bool - Indicate if the table should be created with a header row. - autofilter : bool - If the table has headers, provide autofilter capability. - autofit : bool - Calculate individual column widths from the data. - hidden_columns : list - A list or selector representing table columns to hide in the worksheet. - hide_gridlines : bool - Do not display any gridlines on the output worksheet. - sheet_zoom : int - Set the default zoom level of the output worksheet. - freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) - Freeze workbook panes. - - * If (row, col) is supplied, panes are split at the top-left corner of the - specified cell, which are 0-indexed. Thus, to freeze only the top row, - supply (1, 0). - * Alternatively, cell notation can be used to supply the cell. For example, - "A2" indicates the split occurs at the top-left of cell A2, which is the - equivalent of (1, 0). - * If (row, col, top_row, top_col) are supplied, the panes are split based on - the `row` and `col`, and the scrolling region is inititalized to begin at - the `top_row` and `top_col`. Thus, to freeze only the top row and have the - scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). - Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. - - Notes - ----- - * A list of compatible `xlsxwriter` format property names can be found here: - https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties - - * Conditional formatting dictionaries should provide xlsxwriter-compatible - definitions; polars will take care of how they are applied on the worksheet - with respect to the relative sheet/column position. For supported options, - see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html - - * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible - key/values, as well as a mandatory polars "columns" key that defines the - sparkline source data; these source columns should all be adjacent. Two other - polars-specific keys are available to help define where the sparkline appears - in the table: "insert_after", and "insert_before". The value associated with - these keys should be the name of a column in the exported table. - https://xlsxwriter.readthedocs.io/working_with_sparklines.html - - * Formula dictionaries *must* contain a key called "formula", and then optional - "insert_after", "insert_before", and/or "return_dtype" keys. These additional - keys allow the column to be injected into the table at a specific location, - and/or to define the return type of the formula (eg: "Int64", "Float64", etc). - Formulas that refer to table columns should use Excel\'s structured references - syntax to ensure the formula is applied correctly and is table-relative. 
- https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e - - Examples - -------- - Instantiate a basic DataFrame: - - >>> from random import uniform - >>> from datetime import date - >>> - >>> df = pl.DataFrame( - ... { - ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], - ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], - ... "val": [10_000, 20_000, 30_000], - ... } - ... ) - - Export to "dataframe.xlsx" (the default workbook name, if not specified) in the - working directory, add column totals ("sum" by default) on all numeric columns, - then autofit: - - >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP - - Write frame to a specific location on the sheet, set a named table style, - apply US-style date formatting, increase default float precision, apply a - non-default total function to a single column, autofit: - - >>> df.write_excel( # doctest: +SKIP - ... position="B4", - ... table_style="Table Style Light 16", - ... dtype_formats={pl.Date: "mm/dd/yyyy"}, - ... column_totals={"num": "average"}, - ... float_precision=6, - ... autofit=True, - ... ) - - Write the same frame to a named worksheet twice, applying different styles - and conditional formatting to each table, adding table titles using explicit - xlsxwriter integration: - - >>> from xlsxwriter import Workbook - >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP - ... # basic/default conditional formatting - ... df.write_excel( - ... workbook=wb, - ... worksheet="data", - ... position=(3, 1), # specify position as (row,col) coordinates - ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, - ... table_style="Table Style Medium 4", - ... ) - ... - ... # advanced conditional formatting, custom styles - ... df.write_excel( - ... workbook=wb, - ... worksheet="data", - ... position=(len(df) + 7, 1), - ... table_style={ - ... "style": "Table Style Light 4", - ... "first_column": True, - ... }, - ... conditional_formats={ - ... "num": { - ... "type": "3_color_scale", - ... "min_color": "#76933c", - ... "mid_color": "#c4d79b", - ... "max_color": "#ebf1de", - ... }, - ... "val": { - ... "type": "data_bar", - ... "data_bar_2010": True, - ... "bar_color": "#9bbb59", - ... "bar_negative_color_same": True, - ... "bar_negative_border_color_same": True, - ... }, - ... }, - ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, - ... column_widths={"val": 125}, - ... autofit=True, - ... ) - ... - ... # add some table titles (with a custom format) - ... ws = wb.get_worksheet_by_name("data") - ... fmt_title = wb.add_format( - ... { - ... "font_color": "#4f6228", - ... "font_size": 12, - ... "italic": True, - ... "bold": True, - ... } - ... ) - ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) - ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) - ... - - Export a table containing two different types of sparklines. Use default - options for the "trend" sparkline and customised options (and positioning) - for the "+/-" win_loss sparkline, with non-default integer dtype formatting, - column totals, a subtle two-tone heatmap and hidden worksheet gridlines: - - >>> df = pl.DataFrame( - ... { - ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], - ... "q1": [100, 55, -20, 0, 35], - ... "q2": [30, -10, 15, 60, 20], - ... "q3": [-50, 0, 40, 80, 80], - ... "q4": [75, 55, 25, -10, -55], - ... } - ... ) - >>> df.write_excel( # doctest: +SKIP - ... 
table_style="Table Style Light 2", - ... # apply accounting format to all flavours of integer - ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, - ... sparklines={ - ... # default options; just provide source cols - ... "trend": ["q1", "q2", "q3", "q4"], - ... # customised sparkline type, with positioning directive - ... "+/-": { - ... "columns": ["q1", "q2", "q3", "q4"], - ... "insert_after": "id", - ... "type": "win_loss", - ... }, - ... }, - ... conditional_formats={ - ... # create a unified multi-column heatmap - ... ("q1", "q2", "q3", "q4"): { - ... "type": "2_color_scale", - ... "min_color": "#95b3d7", - ... "max_color": "#ffffff", - ... }, - ... }, - ... column_totals=["q1", "q2", "q3", "q4"], - ... row_totals=True, - ... hide_gridlines=True, - ... ) - - Export a table containing an Excel formula-based column that calculates a - standardised Z-score, showing use of structured references in conjunction - with positioning directives, column totals, and custom formatting. - - >>> df = pl.DataFrame( - ... { - ... "id": ["a123", "b345", "c567", "d789", "e101"], - ... "points": [99, 45, 50, 85, 35], - ... } - ... ) - >>> df.write_excel( # doctest: +SKIP - ... table_style={ - ... "style": "Table Style Medium 15", - ... "first_column": True, - ... }, - ... column_formats={ - ... "id": {"font": "Consolas"}, - ... "points": {"align": "center"}, - ... "z-score": {"align": "center"}, - ... }, - ... column_totals="average", - ... formulas={ - ... "z-score": { - ... # use structured references to refer to the table columns and \'totals\' row - ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", - ... "insert_after": "points", - ... "return_dtype": pl.Float64, - ... } - ... }, - ... hide_gridlines=True, - ... sheet_zoom=125, - ... ) - - ''' - def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: - ''' - Write to Arrow IPC binary stream or Feather file. - - See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - file - Path or writeable file-like object to which the IPC data will be - written. If set to `None`, the output is returned as a BytesIO object. - compression : {\'uncompressed\', \'lz4\', \'zstd\'} - Compression method. Defaults to "uncompressed". - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.arrow" - >>> df.write_ipc(path) - - ''' - def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: - ''' - Write to Arrow IPC record batch stream. - - See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - file - Path or writeable file-like object to which the IPC record batch data will - be written. If set to `None`, the output is returned as a BytesIO object. - compression : {\'uncompressed\', \'lz4\', \'zstd\'} - Compression method. Defaults to "uncompressed". - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... 
) - >>> path: pathlib.Path = dirpath / "new_file.arrow" - >>> df.write_ipc_stream(path) - - ''' - def write_parquet(self, file: str | Path | BytesIO) -> None: - ''' - Write to Apache Parquet file. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. - Choose "snappy" for more backwards compatibility guarantees - when you deal with older parquet readers. - compression_level - The level of compression to use. Higher compression means smaller files on - disk. - - - "gzip" : min-level: 0, max-level: 10. - - "brotli" : min-level: 0, max-level: 11. - - "zstd" : min-level: 1, max-level: 22. - - statistics - Write statistics to the parquet headers. This requires extra compute. - row_group_size - Size of the row groups in number of rows. Defaults to 512^2 rows. - use_pyarrow - Use C++ parquet implementation vs Rust parquet implementation. - At the moment C++ supports more features. - pyarrow_options - Arguments passed to `pyarrow.parquet.write_table`. - - If you pass `partition_cols` here, the dataset will be written - using `pyarrow.parquet.write_to_dataset`. - The `partition_cols` parameter leads to write the dataset to a directory. - Similar to Spark\'s partitioned datasets. - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.parquet" - >>> df.write_parquet(path) - - We can use pyarrow with use_pyarrow_write_to_dataset=True - to write partitioned datasets. The following example will - write the first row to ../watermark=1/*.parquet and the - other rows to ../watermark=2/*.parquet. - - >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) - >>> path: pathlib.Path = dirpath / "partitioned_object" - >>> df.write_parquet( - ... path, - ... use_pyarrow=True, - ... pyarrow_options={"partition_cols": ["watermark"]}, - ... ) - - ''' - def write_database(self, table_name: str, connection: str) -> None: - ''' - Write a polars frame to a database. - - Parameters - ---------- - table_name - Name of the table to create or append to in the target SQL database. - If your table name contains special characters, it should be quoted. - connection - Connection URI string, for example: - - * "postgresql://user:pass@server:port/database" - * "sqlite:////path/to/database.db" - if_exists : {\'append\', \'replace\', \'fail\'} - The insert mode. - \'replace\' will create a new database table, overwriting an existing one. - \'append\' will append to an existing table. - \'fail\' will fail if table already exists. - engine : {\'sqlalchemy\', \'adbc\'} - Select the engine used for writing the data. - ''' - def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: - ''' - Write DataFrame as delta table. - - Parameters - ---------- - target - URI of a table or a DeltaTable object. - mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} - How to handle existing data. - - * If \'error\', throw an error if the table already exists (default). - * If \'append\', will add new data. - * If \'overwrite\', will replace table with new data. - * If \'ignore\', will not write anything if table already exists. 
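Stepping back to write_database above, whose stub entry carries no doctest: a minimal hedged sketch of how its documented parameters combine. The table name and URI below are placeholders, and the call is skipped because it requires a reachable database and driver:

>>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})
>>> df.write_database(
...     table_name="records",
...     connection="sqlite:////path/to/database.db",
...     if_exists="replace",
...     engine="sqlalchemy",
... )  # doctest: +SKIP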
- overwrite_schema - If True, allows updating the schema of the table. - storage_options - Extra options for the storage backends supported by `deltalake`. - For cloud storages, this may include configurations for authentication etc. - - * See a list of supported storage options for S3 `here `__. - * See a list of supported storage options for GCS `here `__. - * See a list of supported storage options for Azure `here `__. - delta_write_options - Additional keyword arguments while writing a Delta lake Table. - See a list of supported write options `here `__. - - Raises - ------ - TypeError - If the DataFrame contains unsupported data types. - ArrowInvalidError - If the DataFrame contains data types that could not be cast to their - primitive type. - - Notes - ----- - The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` - are not supported by the delta protocol specification and will raise a - TypeError. - - Some other data types are not supported but have an associated `primitive type - `__ - to which they can be cast. This affects the following data types: - - - Unsigned integers - - :class:`Datetime` types with millisecond or nanosecond precision or with - time zone information - - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) - - Polars columns are always nullable. To write data to a delta table with - non-nullable columns, a custom pyarrow schema has to be passed to the - `delta_write_options`. See the last example below. - - Examples - -------- - Write a dataframe to the local filesystem as a Delta Lake table. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> table_path = "/path/to/delta-table/" - >>> df.write_delta(table_path) # doctest: +SKIP - - Append data to an existing Delta Lake table on the local filesystem. - Note that this will fail if the schema of the new data does not match the - schema of the existing table. - - >>> df.write_delta(table_path, mode="append") # doctest: +SKIP - - Overwrite a Delta Lake table as a new version. - If the schemas of the new and old data are the same, setting - `overwrite_schema` is not required. - - >>> existing_table_path = "/path/to/delta-table/" - >>> df.write_delta( - ... existing_table_path, mode="overwrite", overwrite_schema=True - ... ) # doctest: +SKIP - - Write a dataframe as a Delta Lake table to a cloud object store like S3. - - >>> table_path = "s3://bucket/prefix/to/delta-table/" - >>> df.write_delta( - ... table_path, - ... storage_options={ - ... "AWS_REGION": "THE_AWS_REGION", - ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", - ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", - ... }, - ... ) # doctest: +SKIP - - Write DataFrame as a Delta Lake table with non-nullable columns. - - >>> import pyarrow as pa - >>> existing_table_path = "/path/to/delta-table/" - >>> df.write_delta( - ... existing_table_path, - ... delta_write_options={ - ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) - ... }, - ... ) # doctest: +SKIP - - ''' - def estimated_size(self, unit: SizeUnit = ...) -> int | float: - ''' - Return an estimation of the total (heap) allocated size of the `DataFrame`. - - Estimated size is given in the specified unit (bytes by default). - - This estimation is the sum of the size of its buffers, validity, including - nested arrays. Multiple arrays may share buffers and bitmaps. 
Therefore, the - size of 2 arrays is not the sum of the sizes computed from this function. In - particular, [`StructArray`]\'s size is an upper bound. - - When an array is sliced, its allocated size remains constant because the buffer - unchanged. However, this function will yield a smaller number. This is because - this function returns the visible size of the buffer, not its total capacity. - - FFI buffers are included in this estimation. - - Parameters - ---------- - unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} - Scale the returned size to the given unit. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "x": list(reversed(range(1_000_000))), - ... "y": [v / 1000 for v in range(1_000_000)], - ... "z": [str(v) for v in range(1_000_000)], - ... }, - ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], - ... ) - >>> df.estimated_size() - 25888898 - >>> df.estimated_size("mb") - 24.689577102661133 - - ''' - def transpose(self) -> Self: - ''' - Transpose a DataFrame over the diagonal. - - Parameters - ---------- - include_header - If set, the column names will be added as first column. - header_name - If `include_header` is set, this determines the name of the column that will - be inserted. - column_names - Optional iterable yielding strings or a string naming an existing column. - These will name the value (non-header) columns in the transposed data. - - Notes - ----- - This is a very expensive operation. Perhaps you can do it differently. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) - >>> df.transpose(include_header=True) - shape: (2, 4) - ┌────────┬──────────┬──────────┬──────────┐ - │ column ┆ column_0 ┆ column_1 ┆ column_2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞════════╪══════════╪══════════╪══════════╡ - │ a ┆ 1 ┆ 2 ┆ 3 │ - │ b ┆ 1 ┆ 2 ┆ 3 │ - └────────┴──────────┴──────────┴──────────┘ - - Replace the auto-generated column names with a list - - >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 2 ┆ 3 │ - │ 1 ┆ 2 ┆ 3 │ - └─────┴─────┴─────┘ - - Include the header as a separate column - - >>> df.transpose( - ... include_header=True, header_name="foo", column_names=["a", "b", "c"] - ... ) - shape: (2, 4) - ┌─────┬─────┬─────┬─────┐ - │ foo ┆ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═════╡ - │ a ┆ 1 ┆ 2 ┆ 3 │ - │ b ┆ 1 ┆ 2 ┆ 3 │ - └─────┴─────┴─────┴─────┘ - - Replace the auto-generated column with column names from a generator function - - >>> def name_generator(): - ... base_name = "my_column_" - ... count = 0 - ... while True: - ... yield f"{base_name}{count}" - ... count += 1 - ... 
- >>> df.transpose(include_header=False, column_names=name_generator()) - shape: (2, 3) - ┌─────────────┬─────────────┬─────────────┐ - │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════════════╪═════════════╪═════════════╡ - │ 1 ┆ 2 ┆ 3 │ - │ 1 ┆ 2 ┆ 3 │ - └─────────────┴─────────────┴─────────────┘ - - Use an existing column as the new column names - - >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) - >>> df.transpose(column_names="id") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 3 ┆ 2 │ - │ 3 ┆ 4 ┆ 6 │ - └─────┴─────┴─────┘ - >>> df.transpose(include_header=True, header_name="new_id", column_names="id") - shape: (2, 4) - ┌────────┬─────┬─────┬─────┐ - │ new_id ┆ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╪═════╡ - │ col1 ┆ 1 ┆ 3 ┆ 2 │ - │ col2 ┆ 3 ┆ 4 ┆ 6 │ - └────────┴─────┴─────┴─────┘ - ''' - def reverse(self) -> DataFrame: - ''' - Reverse the DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "key": ["a", "b", "c"], - ... "val": [1, 2, 3], - ... } - ... ) - >>> df.reverse() - shape: (3, 2) - ┌─────┬─────┐ - │ key ┆ val │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ c ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 1 │ - └─────┴─────┘ - - ''' - def rename(self, mapping: dict[str, str]) -> DataFrame: - ''' - Rename column names. - - Parameters - ---------- - mapping - Key value pairs that map from old name to new name. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} - ... ) - >>> df.rename({"foo": "apple"}) - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - - ''' - def insert_column(self, index: int, column: Series) -> Self: - ''' - Insert a Series at a certain column index. - - This operation is in place. - - Parameters - ---------- - index - Index at which to insert the new `Series` column. - column - `Series` to insert. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> s = pl.Series("baz", [97, 98, 99]) - >>> df.insert_column(1, s) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ baz ┆ bar │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 97 ┆ 4 │ - │ 2 ┆ 98 ┆ 5 │ - │ 3 ┆ 99 ┆ 6 │ - └─────┴─────┴─────┘ - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) - >>> df.insert_column(3, s) - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ d │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ - │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ - │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ - └─────┴──────┴───────┴──────┘ - - ''' - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: - ''' - Filter the rows in the DataFrame based on a predicate expression. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - predicates - Expression that evaluates to a boolean Series. - constraints - Column filters. Use name=value to filter column name by the supplied value. 
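One form from the filter signature above that the doctests below do not exercise: a plain boolean mask (list[bool] or a NumPy boolean array) can be passed directly as the predicate. A small sketch, shown ahead of the expression-based examples that follow:

>>> import numpy as np
>>> df = pl.DataFrame({"foo": [1, 2, 3]})
>>> df.filter([True, False, True])            # doctest: +SKIP
>>> df.filter(np.array([True, False, True]))  # doctest: +SKIP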
- - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - - Filter on one condition: - - >>> df.filter(pl.col("foo") > 1) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Filter on multiple conditions, combined with and/or operators: - - >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Provide multiple filters using `*args` syntax: - - >>> df.filter( - ... pl.col("foo") <= 2, - ... ~pl.col("ham").is_in(["b", "c"]), - ... ) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `**kwargs` syntax: - - >>> df.filter(foo=2, ham="b") - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def glimpse(self) -> str | None: - ''' - Return a dense preview of the DataFrame. - - The formatting shows one line per column so that wide dataframes display - cleanly. Each line shows the column name, the data type, and the first - few values. - - Parameters - ---------- - max_items_per_column - Maximum number of items to show per column. - max_colname_length - Maximum length of the displayed column names; values that exceed this - value are truncated with a trailing ellipsis. - return_as_string - If True, return the preview as a string instead of printing to stdout. - - See Also - -------- - describe, head, tail - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, 2.8, 3.0], - ... "b": [4, 5, None], - ... "c": [True, False, True], - ... "d": [None, "b", "c"], - ... "e": ["usd", "eur", None], - ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], - ... } - ... ) - >>> df.glimpse() - Rows: 3 - Columns: 6 - $ a 1.0, 2.8, 3.0 - $ b 4, 5, None - $ c True, False, True - $ d None, \'b\', \'c\' - $ e \'usd\', \'eur\', None - $ f 2020-01-01, 2021-01-02, 2022-01-01 - - ''' - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: - ''' - Summary statistics for a DataFrame. - - Parameters - ---------- - percentiles - One or more percentiles to include in the summary statistics. - All values must be in the range `[0, 1]`. - - Notes - ----- - The median is included by default as the 50% percentile. - - See Also - -------- - glimpse - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, 2.8, 3.0], - ... "b": [4, 5, None], - ... "c": [True, False, True], - ... "d": [None, "b", "c"], - ... "e": ["usd", "eur", None], - ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], - ... } - ... 
) - >>> df.describe() - shape: (9, 7) - ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ - │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ - ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ - │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ - │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ - │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ - │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ - │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ - │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ - │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ - │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ - │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ - └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ - - ''' - def get_column_index(self, name: str) -> int: - ''' - Find the index of a column by name. - - Parameters - ---------- - name - Name of the column to find. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} - ... ) - >>> df.get_column_index("ham") - 2 - - ''' - def replace_column(self, index: int, column: Series) -> Self: - ''' - Replace a column at an index location. - - This operation is in place. - - Parameters - ---------- - index - Column index. - column - Series that will replace the column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> s = pl.Series("apple", [10, 20, 30]) - >>> df.replace_column(0, s) - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 10 ┆ 6 ┆ a │ - │ 20 ┆ 7 ┆ b │ - │ 30 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - ''' - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: - ''' - Sort the dataframe by the given columns. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. - *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [6.0, 5.0, 4.0], - ... "c": ["a", "c", "b"], - ... } - ... ) - >>> df.sort("a") - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - Sorting by expressions is also supported. - - >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - └──────┴─────┴─────┘ - - Sort by multiple columns by passing a list of columns. - - >>> df.sort(["c", "a"], descending=True) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - └──────┴─────┴─────┘ - - Or use positional arguments to sort by multiple columns in the same way. 
- - >>> df.sort("c", "a", descending=[False, True]) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - ''' - def top_k(self, k: int) -> DataFrame: - ''' - Return the `k` largest elements. - - If \'descending=True` the smallest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - bottom_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 largest values in column b. - - >>> df.top_k(4, by="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ a ┆ 2 │ - │ b ┆ 2 │ - │ b ┆ 1 │ - └─────┴─────┘ - - Get the rows which contain the 4 largest values when sorting on column b and a. - - >>> df.top_k(4, by=["b", "a"]) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 2 │ - │ c ┆ 1 │ - └─────┴─────┘ - - ''' - def bottom_k(self, k: int) -> DataFrame: - ''' - Return the `k` smallest elements. - - If \'descending=True` the largest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - top_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 smallest values in column b. - - >>> df.bottom_k(4, by="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 1 │ - │ a ┆ 1 │ - │ c ┆ 1 │ - │ a ┆ 2 │ - └─────┴─────┘ - - Get the rows which contain the 4 smallest values when sorting on column a and b. - - >>> df.bottom_k(4, by=["a", "b"]) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ b ┆ 1 │ - │ b ┆ 2 │ - └─────┴─────┘ - - ''' - def equals(self, other: DataFrame) -> bool: - ''' - Check whether the DataFrame is equal to another DataFrame. - - Parameters - ---------- - other - DataFrame to compare with. - null_equal - Consider null values as equal. - - See Also - -------- - assert_frame_equal - - Examples - -------- - >>> df1 = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df2 = pl.DataFrame( - ... { - ... "foo": [3, 2, 1], - ... 
"bar": [8.0, 7.0, 6.0], - ... "ham": ["c", "b", "a"], - ... } - ... ) - >>> df1.equals(df1) - True - >>> df1.equals(df2) - False - - ''' - def replace(self, column: str, new_column: Series) -> Self: - ''' - Replace a column by a new Series. - - Parameters - ---------- - column - Column to replace. - new_column - New column to insert. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> s = pl.Series([10, 20, 30]) - >>> df.replace("foo", s) # works in-place! # doctest: +SKIP - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 10 ┆ 4 │ - │ 20 ┆ 5 │ - │ 30 ┆ 6 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int, length: int | None = ...) -> Self: - ''' - Get a slice of this DataFrame. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.slice(1, 2) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7.0 ┆ b │ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def head(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the last `abs(n)`. - - See Also - -------- - tail, glimpse, slice - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> df.head(3) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Pass a negative value to get all rows `except` the last `abs(n)`. - - >>> df.head(-3) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def tail(self, n: int = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the first `abs(n)`. - - See Also - -------- - head, slice - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> df.tail(3) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - │ 4 ┆ 9 ┆ d │ - │ 5 ┆ 10 ┆ e │ - └─────┴─────┴─────┘ - - Pass a negative value to get all rows `except` the first `abs(n)`. - - >>> df.tail(-3) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 4 ┆ 9 ┆ d │ - │ 5 ┆ 10 ┆ e │ - └─────┴─────┴─────┘ - - ''' - def limit(self, n: int = ...) -> Self: - """ - Get the first `n` rows. - - Alias for :func:`DataFrame.head`. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the last `abs(n)`. 
- - See Also - -------- - head - - """ - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: - ''' - Drop all rows that contain null values. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - subset - Column name(s) for which null values are considered. - If set to `None` (default), use all columns. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, None, 8], - ... "ham": ["a", "b", None], - ... } - ... ) - - The default behavior of this method is to drop rows where any single - value of the row is null. - - >>> df.drop_nulls() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - This behaviour can be constrained to consider only a subset of columns, as - defined by name or with a selector. For example, dropping rows if there is - a null in any of the integer columns: - - >>> import polars.selectors as cs - >>> df.drop_nulls(subset=cs.integer()) - shape: (2, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ null │ - └─────┴─────┴──────┘ - - Below are some additional examples that show how to drop null - values based on other conditions. - - >>> df = pl.DataFrame( - ... { - ... "a": [None, None, None, None], - ... "b": [1, 2, None, 1], - ... "c": [1, None, None, 1], - ... } - ... ) - >>> df - shape: (4, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪══════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ null ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴──────┴──────┘ - - Drop a row only if all values are null: - - >>> df.filter(~pl.all_horizontal(pl.all().is_null())) - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴─────┴──────┘ - - Drop a column if all values are null: - - >>> df[[s.name for s in df if not (s.null_count() == df.height)]] - shape: (4, 2) - ┌──────┬──────┐ - │ b ┆ c │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 1 ┆ 1 │ - │ 2 ┆ null │ - │ null ┆ null │ - │ 1 ┆ 1 │ - └──────┴──────┘ - - ''' - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the frame as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Notes - ----- - It is recommended to use LazyFrame when piping operations, in order - to fully take advantage of query optimization and parallelization. - See :meth:`df.lazy() `. - - Examples - -------- - >>> def cast_str_to_int(data, col_name): - ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) - ... 
- >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) - >>> df.pipe(cast_str_to_int, col_name="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 30 │ - │ 4 ┆ 40 │ - └─────┴─────┘ - - >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) - >>> df - shape: (2, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - └─────┴─────┘ - >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 1 │ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: - ''' - Add a column at index 0 that counts the rows. - - Parameters - ---------- - name - Name of the column to add. - offset - Start the row count at this offset. Default = 0 - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> df.with_row_count() - shape: (3, 3) - ┌────────┬─────┬─────┐ - │ row_nr ┆ a ┆ b │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╡ - │ 0 ┆ 1 ┆ 2 │ - │ 1 ┆ 3 ┆ 4 │ - │ 2 ┆ 5 ┆ 6 │ - └────────┴─────┴─────┘ - - ''' - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: - ''' - Start a group by operation. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - .. note:: - Within each group, the order of rows is always preserved, regardless - of this argument. - - Returns - ------- - GroupBy - Object which can be used to perform aggregations. - - Examples - -------- - Group by one column and call `agg` to compute the grouped sum of another - column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... ) - >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 2 │ - │ b ┆ 5 │ - │ c ┆ 3 │ - └─────┴─────┘ - - Set `maintain_order=True` to ensure the order of the groups is consistent with - the input. - - >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) - shape: (3, 2) - ┌─────┬───────────┐ - │ a ┆ c │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [5, 3] │ - │ b ┆ [4, 2] │ - │ c ┆ [1] │ - └─────┴───────────┘ - - Group by multiple columns by passing a list of column names. - - >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - Or use positional arguments to group by multiple columns in the same way. - Expressions are also accepted. 
- - >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ f64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 0 ┆ 4.0 │ - │ b ┆ 1 ┆ 3.0 │ - │ c ┆ 1 ┆ 1.0 │ - └─────┴─────┴─────┘ - - The `GroupBy` object returned by this method is iterable, returning the name - and data of each group. - - >>> for name, data in df.group_by("a"): # doctest: +SKIP - ... print(name) - ... print(data) - ... - a - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘ - b - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘ - c - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def rolling(self, index_column: IntoExpr) -> RollingGroupBy: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - Different from a `group_by_dynamic` the windows are now determined by the - individual values and are not of constant intervals. For constant intervals use - :func:`DataFrame.group_by_dynamic`. - - If you have a time series ``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... - * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... - * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - **"1i" # length 1** - - **"10i" # length 10** - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling operation on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. 
If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - RollingGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - group_by_dynamic - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> out = df.rolling(index_column="dt", period="2d").agg( - ... [ - ... pl.sum("a").alias("sum_a"), - ... pl.min("a").alias("min_a"), - ... pl.max("a").alias("max_a"), - ... ] - ... ) - >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] - >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] - >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] - >>> out - shape: (6, 4) - ┌─────────────────────┬───────┬───────┬───────┐ - │ dt ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴───────┴───────┴───────┘ - - ''' - def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - Time windows are calculated and rows are assigned to windows. Different from a - normal group by is that a row can be member of multiple groups. - By default, the windows look like: - - - [start, start + period) - - [start + every, start + every + period) - - [start + 2*every, start + 2*every + period) - - ... - - where `start` is determined by `start_by`, `offset`, and `every` (see parameter - descriptions below). - - .. warning:: - The index column must be sorted in ascending order. If `by` is passed, then - the index column must be sorted in ascending order within each group. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - - .. deprecated:: 0.19.4 - Use `label` instead. - include_boundaries - Add the lower and upper bound of the window to the "_lower_boundary" and - "_upper_boundary" columns. This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). 
- label : {\'left\', \'right\', \'datapoint\'} - Define which label to use for the window: - - - \'left\': lower boundary of the window - - \'right\': upper boundary of the window - - \'datapoint\': the first value of the index column in the given window. - If you don\'t need the label to be at one of the boundaries, choose this - option for maximum performance - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - DynamicGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - rolling - - Notes - ----- - 1) If you\'re coming from pandas, then - - .. code-block:: python - - # polars - df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) - - is equivalent to - - .. code-block:: python - - # pandas - df.set_index("ts").resample("D")["value"].sum().reset_index() - - though note that, unlike pandas, polars doesn\'t add extra rows for empty - windows. If you need `index_column` to be evenly spaced, then please combine - with :func:`DataFrame.upsample`. - - 2) The `every`, `period` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a group_by_dynamic on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Examples - -------- - >>> from datetime import datetime - >>> df = pl.DataFrame( - ... { - ... "time": pl.datetime_range( - ... start=datetime(2021, 12, 16), - ... end=datetime(2021, 12, 16, 3), - ... interval="30m", - ... eager=True, - ... ), - ... "n": range(7), - ... } - ... 
) - >>> df - shape: (7, 2) - ┌─────────────────────┬─────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i64 │ - ╞═════════════════════╪═════╡ - │ 2021-12-16 00:00:00 ┆ 0 │ - │ 2021-12-16 00:30:00 ┆ 1 │ - │ 2021-12-16 01:00:00 ┆ 2 │ - │ 2021-12-16 01:30:00 ┆ 3 │ - │ 2021-12-16 02:00:00 ┆ 4 │ - │ 2021-12-16 02:30:00 ┆ 5 │ - │ 2021-12-16 03:00:00 ┆ 6 │ - └─────────────────────┴─────┘ - - Group by windows of 1 hour starting at 2021-12-16 00:00:00. - - >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [1, 2] │ - │ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ 2021-12-16 02:00:00 ┆ [5, 6] │ - └─────────────────────┴───────────┘ - - The window boundaries can also be added to the aggregation result - - >>> df.group_by_dynamic( - ... "time", every="1h", include_boundaries=True, closed="right" - ... ).agg(pl.col("n").mean()) - shape: (4, 4) - ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ - │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ - ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ - │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ - │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ - │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ - │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ - └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ - - When closed="left", the window excludes the right end of interval: - [lower_bound, upper_bound) - - >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-16 00:00:00 ┆ [0, 1] │ - │ 2021-12-16 01:00:00 ┆ [2, 3] │ - │ 2021-12-16 02:00:00 ┆ [4, 5] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - When closed="both" the time values at the window boundaries belong to 2 groups. - - >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) - shape: (5, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ - │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - Dynamic group bys can also be combined with grouping on normal keys - - >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) - >>> df - shape: (7, 3) - ┌─────────────────────┬─────┬────────┐ - │ time ┆ n ┆ groups │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ str │ - ╞═════════════════════╪═════╪════════╡ - │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ - │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ - │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ - │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ - │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ - │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ - │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ - └─────────────────────┴─────┴────────┘ - >>> df.group_by_dynamic( - ... "time", - ... every="1h", - ... closed="both", - ... by="groups", - ... include_boundaries=True, - ... 
).agg(pl.col("n")) - shape: (7, 5) - ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ - │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ - ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ - │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ - │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ - │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ - │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ - │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ - └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ - - Dynamic group by on an index column - - >>> df = pl.DataFrame( - ... { - ... "idx": pl.int_range(0, 6, eager=True), - ... "A": ["A", "A", "B", "B", "B", "C"], - ... } - ... ) - >>> ( - ... df.group_by_dynamic( - ... "idx", - ... every="2i", - ... period="3i", - ... include_boundaries=True, - ... closed="right", - ... ).agg(pl.col("A").alias("A_agg_list")) - ... ) - shape: (4, 4) - ┌─────────────────┬─────────────────┬─────┬─────────────────┐ - │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 ┆ list[str] │ - ╞═════════════════╪═════════════════╪═════╪═════════════════╡ - │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ - │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ - │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ - │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ - └─────────────────┴─────────────────┴─────┴─────────────────┘ - - ''' - def upsample(self, time_column: str) -> Self: - ''' - Upsample a DataFrame at a regular frequency. - - The `every` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - - - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - Parameters - ---------- - time_column - time column will be used to determine a date_range. - Note that this column has to be sorted for the output to make sense. - every - interval will start \'every\' duration - offset - change the start of the date_range by this offset. - by - First group by these columns and then upsample for every group - maintain_order - Keep the ordering predictable. This is slower. - - Returns - ------- - DataFrame - Result will be sorted by `time_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - Examples - -------- - Upsample a DataFrame by a certain interval. - - >>> from datetime import datetime - >>> df = pl.DataFrame( - ... { - ... "time": [ - ... datetime(2021, 2, 1), - ... datetime(2021, 4, 1), - ... datetime(2021, 5, 1), - ... datetime(2021, 6, 1), - ... ], - ... "groups": ["A", "B", "A", "B"], - ... 
"values": [0, 1, 2, 3], - ... } - ... ).set_sorted("time") - >>> df.upsample( - ... time_column="time", every="1mo", by="groups", maintain_order=True - ... ).select(pl.all().forward_fill()) - shape: (7, 3) - ┌─────────────────────┬────────┬────────┐ - │ time ┆ groups ┆ values │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ str ┆ i64 │ - ╞═════════════════════╪════════╪════════╡ - │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ - │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ - │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ - │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ - └─────────────────────┴────────┴────────┘ - - ''' - def join_asof(self, other: DataFrame) -> DataFrame: - ''' - Perform an asof join. - - This is similar to a left-join except that we match on nearest key rather than - equal keys. - - Both DataFrames must be sorted by the asof_join key. - - For each row in the left DataFrame: - - - A "backward" search selects the last row in the right DataFrame whose - \'on\' key is less than or equal to the left\'s key. - - - A "forward" search selects the first row in the right DataFrame whose - \'on\' key is greater than or equal to the left\'s key. - - - A "nearest" search selects the last row in the right DataFrame whose value - is nearest to the left\'s key. String keys are not currently supported for a - nearest search. - - The default is "backward". - - Parameters - ---------- - other - Lazy DataFrame to join with. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - by - join on these columns before doing asof join - by_left - join on these columns before doing asof join - by_right - join on these columns before doing asof join - strategy : {\'backward\', \'forward\', \'nearest\'} - Join strategy. - suffix - Suffix to append to columns with a duplicate name. - tolerance - Numeric tolerance. By setting this the join will only be done if the near - keys are within this distance. If an asof join is done on columns of dtype - "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta - object or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. - - Examples - -------- - >>> from datetime import datetime - >>> gdp = pl.DataFrame( - ... { - ... "date": [ - ... datetime(2016, 1, 1), - ... datetime(2017, 1, 1), - ... datetime(2018, 1, 1), - ... datetime(2019, 1, 1), - ... ], # note record date: Jan 1st (sorted!) - ... "gdp": [4164, 4411, 4566, 4696], - ... } - ... ).set_sorted("date") - >>> population = pl.DataFrame( - ... { - ... "date": [ - ... datetime(2016, 5, 12), - ... datetime(2017, 5, 12), - ... 
datetime(2018, 5, 12), - ... datetime(2019, 5, 12), - ... ], # note record date: May 12th (sorted!) - ... "population": [82.19, 82.66, 83.12, 83.52], - ... } - ... ).set_sorted("date") - >>> population.join_asof(gdp, on="date", strategy="backward") - shape: (4, 3) - ┌─────────────────────┬────────────┬──────┐ - │ date ┆ population ┆ gdp │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ f64 ┆ i64 │ - ╞═════════════════════╪════════════╪══════╡ - │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ - │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ - │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ - │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ - └─────────────────────┴────────────┴──────┘ - - ''' - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: - ''' - Join in SQL-like fashion. - - Parameters - ---------- - other - DataFrame to join with. - on - Name(s) of the join columns in both DataFrames. - how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} - Join strategy. - - .. note:: - A left join preserves the row order of the left DataFrame. - left_on - Name(s) of the left join column(s). - right_on - Name(s) of the right join column(s). - suffix - Suffix to append to columns with a duplicate name. - validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} - Checks if join is of specified type. - - * *many_to_many* - “m:m”: default, does not result in checks - * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets - * *one_to_many* - “1:m”: check if join keys are unique in left dataset - * *many_to_one* - “m:1”: check if join keys are unique in right dataset - - .. note:: - - - This is currently not supported the streaming engine. - - This is only supported when joined by single columns. - - Returns - ------- - DataFrame - - See Also - -------- - join_asof - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> other_df = pl.DataFrame( - ... { - ... "apple": ["x", "y", "z"], - ... "ham": ["a", "b", "d"], - ... } - ... 
) - >>> df.join(other_df, on="ham") - shape: (2, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - └─────┴─────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="outer") - shape: (4, 4) - ┌──────┬──────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞══════╪══════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ null ┆ null ┆ d ┆ z │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └──────┴──────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="left") - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └─────┴─────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="semi") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 7.0 ┆ b │ - └─────┴─────┴─────┘ - - >>> df.join(other_df, on="ham", how="anti") - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - Notes - ----- - For joining on columns with categorical data, see `pl.StringCache()`. - - ''' - def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: - ''' - Apply a custom/user-defined function (UDF) over the rows of the DataFrame. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - The UDF will receive each row as a tuple of values: `udf(row)`. - - Implementing logic using a Python function is almost always *significantly* - slower and more memory intensive than implementing the same logic using - the native expression API because: - - - The native expression engine runs in Rust; UDFs run in Python. - - Use of Python UDFs forces the DataFrame to be materialized in memory. - - Polars-native expressions can be parallelised (UDFs typically cannot). - - Polars-native expressions can be logically optimised (UDFs cannot). - - Wherever possible you should strongly prefer the native expression API - to achieve the best performance. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output type of the operation. If none given, Polars tries to infer the type. - inference_size - Only used in the case when the custom function returns rows. - This uses the first `n` rows to determine the output schema. - - Notes - ----- - * The frame-level `apply` cannot track column names (as the UDF is a black-box - that may arbitrarily drop, rearrange, transform, or add new columns); if you - want to apply a UDF such that column names are preserved, you should use the - expression-level `apply` syntax instead. - - * If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. 
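(Editorial sketch, not part of the stub diff above: the `@lru_cache` note in `map_rows` has no accompanying example, so here is a minimal illustration. The `expensive` function and its toy arithmetic are invented; the only assumption taken from the docstring is that `map_rows` passes each row to the UDF as a tuple, which is hashable and therefore cacheable.)

.. code-block:: python

    from functools import lru_cache

    import polars as pl

    @lru_cache(maxsize=None)
    def expensive(row: tuple) -> int:
        # stand-in for a costly per-row computation; duplicate rows hit the cache
        return row[0] * 2 + row[1]

    df = pl.DataFrame({"foo": [1, 2, 1, 2], "bar": [10, 20, 10, 20]})
    out = df.map_rows(expensive)  # (1, 10) and (2, 20) are each computed only once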
- - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) - - Return a DataFrame by mapping each row to a tuple: - - >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) - shape: (3, 2) - ┌──────────┬──────────┐ - │ column_0 ┆ column_1 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════════╪══════════╡ - │ 2 ┆ -3 │ - │ 4 ┆ 15 │ - │ 6 ┆ 24 │ - └──────────┴──────────┘ - - However, it is much better to implement this with a native expression: - - >>> df.select( - ... pl.col("foo") * 2, - ... pl.col("bar") * 3, - ... ) # doctest: +IGNORE_RESULT - - Return a DataFrame with a single column by mapping each row to a scalar: - - >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP - shape: (3, 1) - ┌───────┐ - │ apply │ - │ --- │ - │ i64 │ - ╞═══════╡ - │ 1 │ - │ 9 │ - │ 14 │ - └───────┘ - - In this case it is better to use the following native expression: - - >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT - - ''' - def hstack(self, columns: list[Series] | DataFrame) -> Self: - ''' - Return a new DataFrame grown horizontally by stacking multiple Series to it. - - Parameters - ---------- - columns - Series to stack. - in_place - Modify in place. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> x = pl.Series("apple", [10, 20, 30]) - >>> df.hstack([x]) - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6 ┆ a ┆ 10 │ - │ 2 ┆ 7 ┆ b ┆ 20 │ - │ 3 ┆ 8 ┆ c ┆ 30 │ - └─────┴─────┴─────┴───────┘ - - ''' - def vstack(self, other: DataFrame) -> Self: - ''' - Grow this DataFrame vertically by stacking a DataFrame to it. - - Parameters - ---------- - other - DataFrame to stack. - in_place - Modify in place. - - See Also - -------- - extend - - Examples - -------- - >>> df1 = pl.DataFrame( - ... { - ... "foo": [1, 2], - ... "bar": [6, 7], - ... "ham": ["a", "b"], - ... } - ... ) - >>> df2 = pl.DataFrame( - ... { - ... "foo": [3, 4], - ... "bar": [8, 9], - ... "ham": ["c", "d"], - ... } - ... ) - >>> df1.vstack(df2) - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - │ 4 ┆ 9 ┆ d │ - └─────┴─────┴─────┘ - - ''' - def extend(self, other: DataFrame) -> Self: - ''' - Extend the memory backed by this `DataFrame` with the values from `other`. - - Different from `vstack` which adds the chunks from `other` to the chunks of - this `DataFrame`, `extend` appends the data from `other` to the underlying - memory locations and thus may cause a reallocation. - - If this does not cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `vstack` when you want to do a query after a single - append. For instance, during online operations where you add `n` rows and rerun - a query. - - Prefer `vstack` over `extend` when you want to append many times before - doing a query. For instance, when you read in multiple files and want to store - them in a single `DataFrame`. In the latter case, finish the sequence of - `vstack` operations with a `rechunk`. - - Parameters - ---------- - other - DataFrame to vertically add. - - Warnings - -------- - This method modifies the dataframe in-place. The dataframe is returned for - convenience only. 
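(Editorial sketch, not part of the stub diff above: the `extend` docstring recommends finishing a sequence of `vstack` calls with a `rechunk`, but does not show that pattern. The frame contents below are invented for illustration; only `vstack`, `rechunk`, and `extend` are taken from the documented API.)

.. code-block:: python

    import polars as pl

    # many appends (e.g. one frame per input file): vstack repeatedly, rechunk once at the end
    frames = [
        pl.DataFrame({"foo": [1, 2], "bar": [3, 4]}),
        pl.DataFrame({"foo": [5, 6], "bar": [7, 8]}),
    ]
    combined = frames[0]
    for frame in frames[1:]:
        combined = combined.vstack(frame)
    combined = combined.rechunk()

    # a single append followed by further queries: extend writes into the existing buffers
    base = pl.DataFrame({"foo": [1, 2], "bar": [3, 4]})
    base.extend(pl.DataFrame({"foo": [5], "bar": [6]}))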
- - See Also - -------- - vstack - - Examples - -------- - >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) - >>> df1.extend(df2) - shape: (6, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 5 │ - │ 3 ┆ 6 │ - │ 10 ┆ 40 │ - │ 20 ┆ 50 │ - │ 30 ┆ 60 │ - └─────┴─────┘ - - ''' - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: - ''' - Remove columns from the dataframe. - - Parameters - ---------- - columns - Names of the columns that should be removed from the dataframe, or - a selector that determines the columns to drop. - *more_columns - Additional columns to drop, specified as positional arguments. - - Examples - -------- - Drop a single column by passing the name of that column. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.drop("ham") - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 6.0 │ - │ 2 ┆ 7.0 │ - │ 3 ┆ 8.0 │ - └─────┴─────┘ - - Drop multiple columns by passing a list of column names. - - >>> df.drop(["bar", "ham"]) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Drop multiple columns by passing a selector. - - >>> import polars.selectors as cs - >>> df.drop(cs.numeric()) - shape: (3, 1) - ┌─────┐ - │ ham │ - │ --- │ - │ str │ - ╞═════╡ - │ a │ - │ b │ - │ c │ - └─────┘ - - Use positional arguments to drop multiple columns. - - >>> df.drop("foo", "ham") - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 6.0 │ - │ 7.0 │ - │ 8.0 │ - └─────┘ - - ''' - def drop_in_place(self, name: str) -> Series: - ''' - Drop a single column in-place and return the dropped column. - - Parameters - ---------- - name - Name of the column to drop. - - Returns - ------- - Series - The dropped column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.drop_in_place("ham") - shape: (3,) - Series: \'ham\' [str] - [ - "a" - "b" - "c" - ] - - ''' - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: - ''' - Cast DataFrame column(s) to the specified dtype(s). - - Parameters - ---------- - dtypes - Mapping of column names (or selector) to dtypes, or a single dtype - to which all columns will be cast. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], - ... } - ... 
) - - Cast specific frame columns to the specified dtypes: - - >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ u8 ┆ date │ - ╞═════╪═════╪════════════╡ - │ 1.0 ┆ 6 ┆ 2020-01-02 │ - │ 2.0 ┆ 7 ┆ 2021-03-04 │ - │ 3.0 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - Cast all frame columns to the specified dtype: - - >>> df.cast(pl.Utf8).to_dict(as_series=False) - {\'foo\': [\'1\', \'2\', \'3\'], - \'bar\': [\'6.0\', \'7.0\', \'8.0\'], - \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} - - Use selectors to define the columns being cast: - - >>> import polars.selectors as cs - >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ str │ - ╞═════╪═════╪════════════╡ - │ 1 ┆ 6 ┆ 2020-01-02 │ - │ 2 ┆ 7 ┆ 2021-03-04 │ - │ 3 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - ''' - def clear(self, n: int = ...) -> Self: - ''' - Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. - - Returns a `n`-row null-filled DataFrame with an identical schema. - `n` can be greater than the current number of rows in the DataFrame. - - Parameters - ---------- - n - Number of (null-filled) rows to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> df.clear() - shape: (0, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪═════╪══════╡ - └─────┴─────┴──────┘ - - >>> df.clear(n=2) - shape: (2, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪══════╪══════╡ - │ null ┆ null ┆ null │ - │ null ┆ null ┆ null │ - └──────┴──────┴──────┘ - - ''' - def clone(self) -> Self: - ''' - Create a copy of this DataFrame. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current DataFrame, with identical - schema but no data. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.clone() - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true │ - │ 2 ┆ 4.0 ┆ true │ - │ 3 ┆ 10.0 ┆ false │ - │ 4 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - ''' - def get_columns(self) -> list[Series]: - ''' - Get the DataFrame as a List of Series. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.get_columns() - [shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 3 - ], shape: (3,) - Series: \'bar\' [i64] - [ - 4 - 5 - 6 - ]] - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.get_columns() - [shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - ], shape: (4,) - Series: \'b\' [f64] - [ - 0.5 - 4.0 - 10.0 - 13.0 - ], shape: (4,) - Series: \'c\' [bool] - [ - true - true - false - true - ]] - - ''' - def get_column(self, name: str) -> Series: - ''' - Get a single column by name. - - Parameters - ---------- - name : str - Name of the column to retrieve. 
- - Returns - ------- - Series - - See Also - -------- - to_series - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.get_column("foo") - shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - matches_supertype - Fill all matching supertype of the fill `value`. - - Returns - ------- - DataFrame - DataFrame with None values replaced by the filling strategy. - - See Also - -------- - fill_nan - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 4], - ... "b": [0.5, 4, None, 13], - ... } - ... ) - >>> df.fill_null(99) - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 99 ┆ 99.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - >>> df.fill_null(strategy="forward") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> df.fill_null(strategy="max") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> df.fill_null(strategy="zero") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 0 ┆ 0.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - ''' - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: - ''' - Fill floating point NaN values by an Expression evaluation. - - Parameters - ---------- - value - Value with which to replace NaN values. - - Returns - ------- - DataFrame - DataFrame with NaN values replaced by the given value. - - Warnings - -------- - Note that floating point NaNs (Not a Number) are not missing values! - To replace missing values, use :func:`fill_null`. - - See Also - -------- - fill_null - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1.5, 2, float("nan"), 4], - ... "b": [0.5, 4, float("nan"), 13], - ... } - ... ) - >>> df.fill_nan(99) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════╡ - │ 1.5 ┆ 0.5 │ - │ 2.0 ┆ 4.0 │ - │ 99.0 ┆ 99.0 │ - │ 4.0 ┆ 13.0 │ - └──────┴──────┘ - - ''' - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: - ''' - Explode the dataframe to long format by exploding the given columns. - - Parameters - ---------- - columns - Column names, expressions, or a selector defining them. The underlying - columns being exploded must be of List or Utf8 datatype. - *more_columns - Additional names of columns to explode, specified as positional arguments. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "letters": ["a", "a", "b", "c"], - ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], - ... } - ... 
) - >>> df - shape: (4, 2) - ┌─────────┬───────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════════╪═══════════╡ - │ a ┆ [1] │ - │ a ┆ [2, 3] │ - │ b ┆ [4, 5] │ - │ c ┆ [6, 7, 8] │ - └─────────┴───────────┘ - >>> df.explode("numbers") - shape: (8, 2) - ┌─────────┬─────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════════╪═════════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ a ┆ 3 │ - │ b ┆ 4 │ - │ b ┆ 5 │ - │ c ┆ 6 │ - │ c ┆ 7 │ - │ c ┆ 8 │ - └─────────┴─────────┘ - - ''' - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: - ''' - Create a spreadsheet-style pivot table as a DataFrame. - - Only available in eager mode. See "Examples" section below for how to do a - "lazy pivot" if you know the unique column values in advance. - - Parameters - ---------- - values - Column values to aggregate. Can be multiple columns if the *columns* - arguments contains multiple columns as well. - index - One or multiple keys to group by. - columns - Name of the column(s) whose values will be used as the header of the output - DataFrame. - aggregate_function - Choose from: - - - None: no aggregation takes place, will raise error if multiple values are in group. - - A predefined aggregate function string, one of - {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} - - An expression to do the aggregation. - - maintain_order - Sort the grouped keys so that the output order is predictable. - sort_columns - Sort the transposed columns by name. Default is by order of discovery. - separator - Used as separator/delimiter in generated column names. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": ["one", "one", "two", "two", "one", "two"], - ... "bar": ["y", "y", "y", "x", "x", "x"], - ... "baz": [1, 2, 3, 4, 5, 6], - ... } - ... ) - >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ y ┆ x │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ one ┆ 3 ┆ 5 │ - │ two ┆ 3 ┆ 10 │ - └─────┴─────┴─────┘ - - Pivot using selectors to determine the index/values/columns: - - >>> import polars.selectors as cs - >>> df.pivot( - ... values=cs.numeric(), - ... index=cs.string(), - ... columns=cs.string(), - ... aggregate_function="sum", - ... sort_columns=True, - ... ).sort( - ... by=cs.string(), - ... ) - shape: (4, 6) - ┌─────┬─────┬──────┬──────┬──────┬──────┐ - │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪══════╪══════╪══════╪══════╡ - │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ - │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ - │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ - │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ - └─────┴─────┴──────┴──────┴──────┴──────┘ - - Run an expression as aggregation function - - >>> df = pl.DataFrame( - ... { - ... "col1": ["a", "a", "a", "b", "b", "b"], - ... "col2": ["x", "x", "x", "x", "y", "y"], - ... "col3": [6, 7, 3, 2, 5, 7], - ... } - ... ) - >>> df.pivot( - ... index="col1", - ... columns="col2", - ... values="col3", - ... aggregate_function=pl.element().tanh().mean(), - ... 
) - shape: (2, 3) - ┌──────┬──────────┬──────────┐ - │ col1 ┆ x ┆ y │ - │ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 │ - ╞══════╪══════════╪══════════╡ - │ a ┆ 0.998347 ┆ null │ - │ b ┆ 0.964028 ┆ 0.999954 │ - └──────┴──────────┴──────────┘ - - Note that `pivot` is only available in eager mode. If you know the unique - column values in advance, you can use :meth:`polars.LazyFrame.groupby` to - get the same result as above in lazy mode: - - >>> index = pl.col("col1") - >>> columns = pl.col("col2") - >>> values = pl.col("col3") - >>> unique_column_values = ["x", "y"] - >>> aggregate_function = lambda col: col.tanh().mean() - >>> ( - ... df.lazy() - ... .group_by(index) - ... .agg( - ... *[ - ... aggregate_function(values.filter(columns == value)).alias(value) - ... for value in unique_column_values - ... ] - ... ) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌──────┬──────────┬──────────┐ - │ col1 ┆ x ┆ y │ - │ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 │ - ╞══════╪══════════╪══════════╡ - │ a ┆ 0.998347 ┆ null │ - │ b ┆ 0.964028 ┆ 0.999954 │ - └──────┴──────────┴──────────┘ - - ''' - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: - ''' - Unpivot a DataFrame from wide to long format. - - Optionally leaves identifiers set. - - This function is useful to massage a DataFrame into a format where one or more - columns are identifier variables (id_vars) while all other columns, considered - measured variables (value_vars), are "unpivoted" to the row axis leaving just - two non-identifier columns, \'variable\' and \'value\'. - - Parameters - ---------- - id_vars - Column(s) or selector(s) to use as identifier variables. - value_vars - Column(s) or selector(s) to use as values variables; if `value_vars` - is empty all columns that are not in `id_vars` will be used. - variable_name - Name to give to the `variable` column. Defaults to "variable" - value_name - Name to give to the `value` column. Defaults to "value" - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> import polars.selectors as cs - >>> df.melt(id_vars="a", value_vars=cs.numeric()) - shape: (6, 3) - ┌─────┬──────────┬───────┐ - │ a ┆ variable ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 │ - ╞═════╪══════════╪═══════╡ - │ x ┆ b ┆ 1 │ - │ y ┆ b ┆ 3 │ - │ z ┆ b ┆ 5 │ - │ x ┆ c ┆ 2 │ - │ y ┆ c ┆ 4 │ - │ z ┆ c ┆ 6 │ - └─────┴──────────┴───────┘ - - ''' - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: - ''' - Unstack a long table to a wide form without doing an aggregation. - - This can be much faster than a pivot, because it can skip the grouping phase. - - Warnings - -------- - This functionality is experimental and may be subject to changes - without it being considered a breaking change. - - Parameters - ---------- - step - Number of rows in the unstacked frame. - how : { \'vertical\', \'horizontal\' } - Direction of the unstack. - columns - Column name(s) or selector(s) to include in the operation. - If set to `None` (default), use all columns. - fill_values - Fill values that don\'t fit the new size with this value. 
- - Examples - -------- - >>> from string import ascii_uppercase - >>> df = pl.DataFrame( - ... { - ... "x": list(ascii_uppercase[0:8]), - ... "y": pl.int_range(1, 9, eager=True), - ... } - ... ).with_columns( - ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), - ... ) - >>> df - shape: (8, 3) - ┌─────┬─────┬──────────┐ - │ x ┆ y ┆ z │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ list[u8] │ - ╞═════╪═════╪══════════╡ - │ A ┆ 1 ┆ [1, 2] │ - │ B ┆ 2 ┆ [2, 3] │ - │ C ┆ 3 ┆ [3, 4] │ - │ D ┆ 4 ┆ [4, 5] │ - │ E ┆ 5 ┆ [5, 6] │ - │ F ┆ 6 ┆ [6, 7] │ - │ G ┆ 7 ┆ [7, 8] │ - │ H ┆ 8 ┆ [8, 9] │ - └─────┴─────┴──────────┘ - >>> df.unstack(step=4, how="vertical") - shape: (4, 6) - ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ - │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ - ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ - │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ - │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ - │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ - │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ - └─────┴─────┴─────┴─────┴──────────┴──────────┘ - >>> df.unstack(step=2, how="horizontal") - shape: (4, 6) - ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ - │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ - ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ - │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ - │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ - │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ - │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ - └─────┴─────┴─────┴─────┴──────────┴──────────┘ - >>> import polars.selectors as cs - >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) - shape: (5, 2) - ┌─────┬─────┐ - │ y_0 ┆ y_1 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - │ 4 ┆ 0 │ - │ 5 ┆ 0 │ - └─────┴─────┘ - - ''' - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: - ''' - Group by the given columns and return the groups as separate dataframes. - - Parameters - ---------- - by - Column name(s) or selector(s) to group by. - *more_by - Additional names of columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default partition by operation. - include_key - Include the columns used to partition the DataFrame in the output. - as_dict - Return a dictionary instead of a list. The dictionary keys are the distinct - group values that identify that group. - - Examples - -------- - Pass a single column name to partition by that column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... 
) - >>> df.partition_by("a") # doctest: +IGNORE_RESULT - [shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘] - - Partition by multiple columns by either passing a list of column names, or by - specifying each column name as a positional argument. - - >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT - [shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘] - - Return the partitions as a dictionary by specifying `as_dict=True`. - - >>> import polars.selectors as cs - >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT - {\'a\': shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - \'b\': shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - \'c\': shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘} - - ''' - def shift(self, n: int = ...) -> DataFrame: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... ) - >>> df.shift() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ null ┆ null │ - │ 1 ┆ 5 │ - │ 2 ┆ 6 │ - │ 3 ┆ 7 │ - └──────┴──────┘ - - Pass a negative value to shift in the opposite direction instead. - - >>> df.shift(-2) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ null ┆ null │ - │ null ┆ null │ - └──────┴──────┘ - - Specify `fill_value` to fill the resulting null values. 
- - >>> df.shift(-2, fill_value=100) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ 100 ┆ 100 │ - │ 100 ┆ 100 │ - └─────┴─────┘ - - ''' - def is_duplicated(self) -> Series: - ''' - Get a mask of all duplicated rows in this DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) - >>> df.is_duplicated() - shape: (4,) - Series: \'\' [bool] - [ - true - false - false - true - ] - - This mask can be used to visualize the duplicated lines like this: - - >>> df.filter(df.is_duplicated()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ str │ - ╞═════╪═════╡ - │ 1 ┆ x │ - │ 1 ┆ x │ - └─────┴─────┘ - ''' - def is_unique(self) -> Series: - ''' - Get a mask of all unique rows in this DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) - >>> df.is_unique() - shape: (4,) - Series: \'\' [bool] - [ - false - true - true - false - ] - - This mask can be used to visualize the unique lines like this: - - >>> df.filter(df.is_unique()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ str │ - ╞═════╪═════╡ - │ 2 ┆ y │ - │ 3 ┆ z │ - └─────┴─────┘ - ''' - def lazy(self) -> LazyFrame: - ''' - Start a lazy query from this point. This returns a `LazyFrame` object. - - Operations on a `LazyFrame` are not executed until this is requested by either - calling: - - * :meth:`.fetch() ` - (run on a small number of rows) - * :meth:`.collect() ` - (run on all data) - * :meth:`.describe_plan() ` - (print unoptimized query plan) - * :meth:`.describe_optimized_plan() ` - (print optimized query plan) - * :meth:`.show_graph() ` - (show (un)optimized query plan as graphviz graph) - - Lazy operations are advised because they allow for query optimization and more - parallelization. - - Returns - ------- - LazyFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> df.lazy() # doctest: +ELLIPSIS - - - ''' - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - ''' - Select columns from this DataFrame. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Examples - -------- - Pass the name of a column to select that column. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.select("foo") - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Multiple columns can be selected by passing a list of column names. - - >>> df.select(["foo", "bar"]) - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - └─────┴─────┘ - - Multiple columns can also be selected using positional arguments instead of a - list. Expressions are also accepted. 
- - >>> df.select(pl.col("foo"), pl.col("bar") + 1) - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - └─────┴─────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) - shape: (3, 1) - ┌───────────┐ - │ threshold │ - │ --- │ - │ i32 │ - ╞═══════════╡ - │ 0 │ - │ 0 │ - │ 10 │ - └───────────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... df.select( - ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), - ... ) - ... - shape: (3, 1) - ┌───────────┐ - │ is_odd │ - │ --- │ - │ struct[2] │ - ╞═══════════╡ - │ {1,0} │ - │ {0,1} │ - │ {1,0} │ - └───────────┘ - - ''' - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - """ - Select columns from this LazyFrame. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - See Also - -------- - select - - """ - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - ''' - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - DataFrame - A new DataFrame with the columns added. - - Notes - ----- - Creating a new DataFrame using this method does not create a new copy of - existing data. - - Examples - -------- - Pass an expression to add it as a new column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ a^2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ - └─────┴──────┴───────┴──────┘ - - Added columns will replace existing columns with the same name. - - >>> df.with_columns(pl.col("a").cast(pl.Float64)) - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1.0 ┆ 0.5 ┆ true │ - │ 2.0 ┆ 4.0 ┆ true │ - │ 3.0 ┆ 10.0 ┆ false │ - │ 4.0 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - Multiple columns can be added by passing a list of expressions. - - >>> df.with_columns( - ... [ - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ] - ... 
) - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Multiple columns also can be added using positional arguments instead of a list. - - >>> df.with_columns( - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ) - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> df.with_columns( - ... ab=pl.col("a") * pl.col("b"), - ... not_c=pl.col("c").not_(), - ... ) - shape: (4, 5) - ┌─────┬──────┬───────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ ab ┆ not_c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ - └─────┴──────┴───────┴──────┴───────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... df.drop("c").with_columns( - ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), - ... ) - ... - shape: (4, 3) - ┌─────┬──────┬─────────────┐ - │ a ┆ b ┆ diffs │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ struct[2] │ - ╞═════╪══════╪═════════════╡ - │ 1 ┆ 0.5 ┆ {null,null} │ - │ 2 ┆ 4.0 ┆ {1,3.5} │ - │ 3 ┆ 10.0 ┆ {1,6.0} │ - │ 4 ┆ 13.0 ┆ {1,3.0} │ - └─────┴──────┴─────────────┘ - - ''' - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - """ - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - See Also - -------- - with_columns - - """ - def n_chunks(self, strategy: str = ...) -> int | list[int]: - ''' - Get number of chunks used by the ChunkedArrays of this DataFrame. - - Parameters - ---------- - strategy : {\'first\', \'all\'} - Return the number of chunks of the \'first\' column, - or \'all\' columns in this DataFrame. - - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... 
) - >>> df.n_chunks() - 1 - >>> df.n_chunks(strategy="all") - [1, 1, 1] - - ''' - def max(self, axis: int | None = ...) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their maximum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`max_horizontal`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.max() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def max_horizontal(self) -> Series: - ''' - Get the maximum value horizontally across columns. - - Returns - ------- - Series - A Series named `"max"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.max_horizontal() - shape: (3,) - Series: \'max\' [f64] - [ - 4.0 - 5.0 - 6.0 - ] - ''' - def min(self, axis: int | None = ...) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their minimum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`min_horizontal`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.min() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - ''' - def min_horizontal(self) -> Series: - ''' - Get the minimum value horizontally across columns. - - Returns - ------- - Series - A Series named `"min"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.min_horizontal() - shape: (3,) - Series: \'min\' [f64] - [ - 1.0 - 2.0 - 3.0 - ] - ''' - def sum(self) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their sum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`sum_horizontal`. - null_strategy : {\'ignore\', \'propagate\'} - This argument is only used if `axis == 1`. - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.sum() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 6 ┆ 21 ┆ null │ - └─────┴─────┴──────┘ - ''' - def sum_horizontal(self) -> Series: - ''' - Sum all values horizontally across columns. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - If set to `False`, any null value in the input will lead to a null output. 
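(Editor's aside, not part of the generated stub: a minimal sketch of the `ignore_nulls` flag described above for `sum_horizontal`; the data is made up for illustration.)

import polars as pl

df = pl.DataFrame({"a": [1, 2, None], "b": [10, None, 5]})

# Default: nulls are skipped when summing each row -> 11, 2, 5.
df.sum_horizontal()

# With ignore_nulls=False, any null in a row makes that row's sum null -> 11, null, null.
df.sum_horizontal(ignore_nulls=False)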
- - Returns - ------- - Series - A Series named `"sum"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.sum_horizontal() - shape: (3,) - Series: \'sum\' [f64] - [ - 5.0 - 7.0 - 9.0 - ] - ''' - def mean(self) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their mean value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`mean_horizontal`. - null_strategy : {\'ignore\', \'propagate\'} - This argument is only used if `axis == 1`. - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... "spam": [True, False, None], - ... } - ... ) - >>> df.mean() - shape: (1, 4) - ┌─────┬─────┬──────┬──────┐ - │ foo ┆ bar ┆ ham ┆ spam │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str ┆ f64 │ - ╞═════╪═════╪══════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ - └─────┴─────┴──────┴──────┘ - ''' - def mean_horizontal(self) -> Series: - ''' - Take the mean of all values horizontally across columns. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - If set to `False`, any null value in the input will lead to a null output. - - Returns - ------- - Series - A Series named `"mean"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.mean_horizontal() - shape: (3,) - Series: \'mean\' [f64] - [ - 2.5 - 3.5 - 4.5 - ] - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their standard deviation value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.std() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1.0 ┆ 1.0 ┆ null │ - └─────┴─────┴──────┘ - >>> df.std(ddof=0) - shape: (1, 3) - ┌──────────┬──────────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞══════════╪══════════╪══════╡ - │ 0.816497 ┆ 0.816497 ┆ null │ - └──────────┴──────────┴──────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their variance value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.var() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1.0 ┆ 1.0 ┆ null │ - └─────┴─────┴──────┘ - >>> df.var(ddof=0) - shape: (1, 3) - ┌──────────┬──────────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞══════════╪══════════╪══════╡ - │ 0.666667 ┆ 0.666667 ┆ null │ - └──────────┴──────────┴──────┘ - - ''' - def median(self) -> Self: - ''' - Aggregate the columns of this DataFrame to their median value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.median() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null │ - └─────┴─────┴──────┘ - - ''' - def product(self) -> DataFrame: - ''' - Aggregate the columns of this DataFrame to their product values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [0.5, 4, 10], - ... "c": [True, True, False], - ... } - ... ) - - >>> df.product() - shape: (1, 3) - ┌─────┬──────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ i64 │ - ╞═════╪══════╪═════╡ - │ 6 ┆ 20.0 ┆ 0 │ - └─────┴──────┴─────┘ - - ''' - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.quantile(0.5, "nearest") - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null │ - └─────┴─────┴──────┘ - - ''' - def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Convert categorical variables into dummy/indicator variables. - - Parameters - ---------- - columns - Column name(s) or selector(s) that should be converted to dummy - variables. If set to `None` (default), convert all columns. - separator - Separator/delimiter used when generating column names. - drop_first - Remove the first category from the variables being encoded. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2], - ... "bar": [3, 4], - ... "ham": ["a", "b"], - ... } - ... 
) - >>> df.to_dummies() - shape: (2, 6) - ┌───────┬───────┬───────┬───────┬───────┬───────┐ - │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ - ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ - │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ - │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ - └───────┴───────┴───────┴───────┴───────┴───────┘ - - >>> df.to_dummies(drop_first=True) - shape: (2, 3) - ┌───────┬───────┬───────┐ - │ foo_2 ┆ bar_4 ┆ ham_b │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 │ - ╞═══════╪═══════╪═══════╡ - │ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1 ┆ 1 │ - └───────┴───────┴───────┘ - - >>> import polars.selectors as cs - >>> df.to_dummies(cs.integer(), separator=":") - shape: (2, 5) - ┌───────┬───────┬───────┬───────┬─────┐ - │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ - ╞═══════╪═══════╪═══════╪═══════╪═════╡ - │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ - │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ - └───────┴───────┴───────┴───────┴─────┘ - - >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") - shape: (2, 3) - ┌───────┬───────┬─────┐ - │ foo:2 ┆ bar:4 ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ str │ - ╞═══════╪═══════╪═════╡ - │ 0 ┆ 0 ┆ a │ - │ 1 ┆ 1 ┆ b │ - └───────┴───────┴─────┘ - - ''' - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: - ''' - Drop duplicate rows from this dataframe. - - Parameters - ---------- - subset - Column name(s) or selector(s), to consider when identifying - duplicate rows. If set to `None` (default), use all columns. - keep : {\'first\', \'last\', \'any\', \'none\'} - Which of the duplicate rows to keep. - - * \'any\': Does not give any guarantee of which row is kept. - This allows more optimizations. - * \'none\': Don\'t keep duplicate rows. - * \'first\': Keep first unique row. - * \'last\': Keep last unique row. - maintain_order - Keep the same order as the original DataFrame. This is more expensive to - compute. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - Returns - ------- - DataFrame - DataFrame with unique rows. - - Warnings - -------- - This method will fail if there is a column of type `List` in the DataFrame or - subset. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 1], - ... "bar": ["a", "a", "a", "a"], - ... "ham": ["b", "b", "b", "b"], - ... } - ... ) - >>> df.unique(maintain_order=True) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> df.unique(subset=["bar", "ham"], maintain_order=True) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> df.unique(keep="last", maintain_order=True) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - - ''' - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: - ''' - Return the number of unique rows, or the number of unique row-subsets. - - Parameters - ---------- - subset - One or more columns/expressions that define what to count; - omit to return the count of unique rows. 
- - Notes - ----- - This method operates at the `DataFrame` level; to operate on subsets at the - expression level you can make use of struct-packing instead, for example: - - >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() - - If instead you want to count the number of unique values per-column, you can - also use expression-level syntax to return a new frame containing that result: - - >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) - >>> df_nunique = df.select(pl.all().n_unique()) - - In aggregate context there is also an equivalent method for returning the - unique values per-group: - - >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 1, 2, 3, 4, 5], - ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], - ... "c": [True, True, True, False, True, True], - ... } - ... ) - >>> df.n_unique() - 5 - - Simple columns subset. - - >>> df.n_unique(subset=["b", "c"]) - 4 - - Expression subset. - - >>> df.n_unique( - ... subset=[ - ... (pl.col("a") // 2), - ... (pl.col("c") | (pl.col("b") >= 2)), - ... ], - ... ) - 3 - - ''' - def approx_n_unique(self) -> DataFrame: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> df.approx_n_unique() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_unique(self) -> DataFrame: - """ - Approximate count of unique values. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`DataFrame.approx_n_unique`. - - """ - def rechunk(self) -> Self: - """ - Rechunk the data in this DataFrame to a contiguous allocation. - - This will make sure all subsequent operations have optimal and predictable - performance. - """ - def null_count(self) -> Self: - ''' - Create a new DataFrame that shows the null counts per column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 3], - ... "bar": [6, 7, None], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.null_count() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 1 ┆ 0 │ - └─────┴─────┴─────┘ - - ''' - def sample(self, n: int | Series | None = ...) -> Self: - ''' - Sample from this DataFrame. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - If set to True, the order of the sampled rows will be shuffled. If - set to False (default), the order of the returned rows will be - neither stable nor fully random. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: - ''' - Apply a horizontal reduction on a DataFrame. - - This can be used to effectively determine aggregations on a row level, and can - be applied to any DataType that can be supercasted (casted to a similar parent - type). - - An example of the supercast rules when applying an arithmetic operation on two - DataTypes are for instance: - - - Int8 + Utf8 = Utf8 - - Float32 + Int64 = Float32 - - Float32 + Float64 = Float64 - - Examples - -------- - A horizontal sum operation: - - >>> df = pl.DataFrame( - ... { - ... "a": [2, 1, 3], - ... "b": [1, 2, 3], - ... "c": [1.0, 2.0, 3.0], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 + s2) - shape: (3,) - Series: \'a\' [f64] - [ - 4.0 - 5.0 - 9.0 - ] - - A horizontal minimum operation: - - >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) - >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 1.0 - 3.0 - ] - - A horizontal string concatenation: - - >>> df = pl.DataFrame( - ... { - ... "a": ["foo", "bar", 2], - ... "b": [1, 2, 3], - ... "c": [1.0, 2.0, 3.0], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 + s2) - shape: (3,) - Series: \'a\' [str] - [ - "foo11.0" - "bar22.0" - null - ] - - A horizontal boolean or, similar to a row-wise .any(): - - >>> df = pl.DataFrame( - ... { - ... "a": [False, False, True], - ... "b": [False, True, False], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 | s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - Parameters - ---------- - operation - function that takes two `Series` and returns a `Series`. - - ''' - def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: - ''' - Get the values of a single row, either by index or by predicate. - - Parameters - ---------- - index - Row index. - by_predicate - Select the row according to a given expression/predicate. - named - Return a dictionary instead of a tuple. The dictionary is a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - - Returns - ------- - tuple (default) or dictionary of row values - - Notes - ----- - The `index` and `by_predicate` params are mutually exclusive. Additionally, - to ensure clarity, the `by_predicate` parameter must be supplied by keyword. - - When using `by_predicate` it is an error condition if anything other than - one row is returned; more than one row raises `TooManyRowsReturnedError`, and - zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). - - Warnings - -------- - You should NEVER use this method to iterate over a DataFrame; if you require - row-iteration you should strongly prefer use of `iter_rows()` instead. - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - rows : Materialise all frame data as a list of rows (potentially expensive). - item: Return dataframe element as a scalar. - - Examples - -------- - Specify an index to return the row at the given index as a tuple. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.row(2) - (3, 8, \'c\') - - Specify `named=True` to get a dictionary instead with a mapping of column - names to row values. - - >>> df.row(2, named=True) - {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} - - Use `by_predicate` to return the row that matches the given predicate. - - >>> df.row(by_predicate=(pl.col("ham") == "b")) - (2, 7, \'b\') - - ''' - def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: - ''' - Returns all data in the DataFrame as a list of rows of python-native values. - - Parameters - ---------- - named - Return dictionaries instead of tuples. The dictionaries are a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Warnings - -------- - Row-iteration is not optimal as the underlying data is stored in columnar form; - where possible, prefer export via one of the dedicated export/output methods. - Where possible you should also consider using `iter_rows` instead to avoid - materialising all the data at once. - - Returns - ------- - list of tuples (default) or dictionaries of row values - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - rows_by_key : Materialises frame data as a key-indexed dictionary. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "x": ["a", "b", "b", "a"], - ... "y": [1, 2, 3, 4], - ... "z": [0, 3, 6, 9], - ... } - ... ) - >>> df.rows() - [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] - >>> df.rows(named=True) - [{\'x\': \'a\', \'y\': 1, \'z\': 0}, - {\'x\': \'b\', \'y\': 2, \'z\': 3}, - {\'x\': \'b\', \'y\': 3, \'z\': 6}, - {\'x\': \'a\', \'y\': 4, \'z\': 9}] - - ''' - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: - ''' - Returns DataFrame data as a keyed dictionary of python-native values. - - Note that this method should not be used in place of native operations, due to - the high cost of materialising all frame data out into a dictionary; it should - be used only when you need to move the values out into a Python data structure - or other object that cannot operate directly with Polars/Arrow. - - Parameters - ---------- - key - The column(s) to use as the key for the returned dictionary. If multiple - columns are specified, the key will be a tuple of those values, otherwise - it will be a string. - named - Return dictionary rows instead of tuples, mapping column name to row value. - include_key - Include key values inline with the associated data (by default the key - values are omitted as a memory/performance optimisation, as they can be - reoconstructed from the key). - unique - Indicate that the key is unique; this will result in a 1:1 mapping from - key to a single associated row. Note that if the key is *not* actually - unique the last row with the given key will be returned. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. 
If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - See Also - -------- - rows : Materialise all frame data as a list of rows (potentially expensive). - iter_rows : Row iterator over frame data (does not materialise all rows). - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "w": ["a", "b", "b", "a"], - ... "x": ["q", "q", "q", "k"], - ... "y": [1.0, 2.5, 3.0, 4.5], - ... "z": [9, 8, 7, 6], - ... } - ... ) - - Group rows by the given key column(s): - - >>> df.rows_by_key(key=["w"]) - defaultdict(, - {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], - \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) - - Return the same row groupings as dictionaries: - - >>> df.rows_by_key(key=["w"], named=True) - defaultdict(, - {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, - {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], - \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, - {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) - - Return row groupings, assuming keys are unique: - - >>> df.rows_by_key(key=["z"], unique=True) - {9: (\'a\', \'q\', 1.0), - 8: (\'b\', \'q\', 2.5), - 7: (\'b\', \'q\', 3.0), - 6: (\'a\', \'k\', 4.5)} - - Return row groupings as dictionaries, assuming keys are unique: - - >>> df.rows_by_key(key=["z"], named=True, unique=True) - {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, - 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, - 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, - 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} - - Return dictionary rows grouped by a compound key, including key values: - - >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) - defaultdict(, - {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], - (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, - {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], - (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) - - ''' - def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: - ''' - Returns an iterator over the DataFrame of rows of python-native values. - - Parameters - ---------- - named - Return dictionaries instead of tuples. The dictionaries are a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - buffer_size - Determines the number of rows that are buffered internally while iterating - over the data; you should only modify this in very specific cases where the - default value is determined not to be a good fit to your access pattern, as - the speedup from using the buffer is significant (~2-4x). Setting this - value to zero disables row buffering (not recommended). - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Warnings - -------- - Row iteration is not optimal as the underlying data is stored in columnar form; - where possible, prefer export via one of the dedicated export/output methods - that deals with columnar data. - - Returns - ------- - iterator of tuples (default) or dictionaries (if named) of python row values - - See Also - -------- - rows : Materialises all frame data as a list of rows (potentially expensive). - rows_by_key : Materialises frame data as a key-indexed dictionary. 
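(Editor's aside, not part of the generated stub: a minimal sketch of the `named` and `buffer_size` parameters documented above for `iter_rows`; the buffer size of 1024 is an arbitrary illustrative value.)

import polars as pl

df = pl.DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]})

# Dictionary rows, with a larger internal row buffer than the default.
for row in df.iter_rows(named=True, buffer_size=1024):
    print(row["a"], row["b"])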
- - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> [row[0] for row in df.iter_rows()] - [1, 3, 5] - >>> [row["b"] for row in df.iter_rows(named=True)] - [2, 4, 6] - - ''' - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: - ''' - Returns a non-copying iterator of slices over the underlying DataFrame. - - Parameters - ---------- - n_rows - Determines the number of rows contained in each DataFrame slice. - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... data={ - ... "a": range(17_500), - ... "b": date(2023, 1, 1), - ... "c": "klmnoopqrstuvwxyz", - ... }, - ... schema_overrides={"a": pl.Int32}, - ... ) - >>> for idx, frame in enumerate(df.iter_slices()): - ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") - ... - DataFrame:[0]:10000 - DataFrame:[1]:7500 - - Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and - any supported frame export/conversion types; for example, as RecordBatches: - - >>> for frame in df.iter_slices(n_rows=15_000): - ... record_batch = frame.to_arrow().to_batches()[0] - ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") - ... - a: int32 - b: date32[day] - c: large_string - << 15000 - a: int32 - b: date32[day] - c: large_string - << 2500 - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - partition_by : Split into multiple DataFrames, partitioned by groups. - - ''' - def shrink_to_fit(self) -> Self: - """ - Shrink DataFrame memory usage. - - Shrinks to fit the exact capacity needed to hold the data. - - """ - def gather_every(self, n: int) -> DataFrame: - ''' - Take every nth row in the DataFrame and return as a new DataFrame. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - >>> s.gather_every(2) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 5 │ - │ 3 ┆ 7 │ - └─────┴─────┘ - - ''' - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: - ''' - Hash and combine the rows in this DataFrame. - - The hash value is of type `UInt64`. - - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. - - Notes - ----- - This implementation of :func:`hash_rows` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 3, 4], - ... "ham": ["a", "b", None, "d"], - ... } - ... ) - >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT - shape: (4,) - Series: \'\' [u64] - [ - 10783150408545073287 - 1438741209321515184 - 10047419486152048166 - 2047317070637311557 - ] - - ''' - def interpolate(self) -> DataFrame: - ''' - Interpolate intermediate values. The interpolation method is linear. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 9, 10], - ... "bar": [6, 7, 9, None], - ... "baz": [1, None, None, 9], - ... } - ... 
) - >>> df.interpolate() - shape: (4, 3) - ┌──────┬──────┬──────────┐ - │ foo ┆ bar ┆ baz │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════════╡ - │ 1.0 ┆ 6.0 ┆ 1.0 │ - │ 5.0 ┆ 7.0 ┆ 3.666667 │ - │ 9.0 ┆ 9.0 ┆ 6.333333 │ - │ 10.0 ┆ null ┆ 9.0 │ - └──────┴──────┴──────────┘ - - ''' - def is_empty(self) -> bool: - ''' - Check if the dataframe is empty. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.is_empty() - False - >>> df.filter(pl.col("foo") > 99).is_empty() - True - - ''' - def to_struct(self, name: str) -> Series: - ''' - Convert a `DataFrame` to a `Series` of type `Struct`. - - Parameters - ---------- - name - Name for the struct Series - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4, 5], - ... "b": ["one", "two", "three", "four", "five"], - ... } - ... ) - >>> df.to_struct("nums") - shape: (5,) - Series: \'nums\' [struct[2]] - [ - {1,"one"} - {2,"two"} - {3,"three"} - {4,"four"} - {5,"five"} - ] - - ''' - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Decompose struct columns into separate columns for each of their fields. - - The new columns will be inserted into the dataframe at the location of the - struct column. - - Parameters - ---------- - columns - Name of the struct column(s) that should be unnested. - *more_columns - Additional columns to unnest, specified as positional arguments. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "before": ["foo", "bar"], - ... "t_a": [1, 2], - ... "t_b": ["a", "b"], - ... "t_c": [True, None], - ... "t_d": [[1, 2], [3]], - ... "after": ["baz", "womp"], - ... } - ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") - >>> df - shape: (2, 3) - ┌────────┬─────────────────────┬───────┐ - │ before ┆ t_struct ┆ after │ - │ --- ┆ --- ┆ --- │ - │ str ┆ struct[4] ┆ str │ - ╞════════╪═════════════════════╪═══════╡ - │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ - │ bar ┆ {2,"b",null,[3]} ┆ womp │ - └────────┴─────────────────────┴───────┘ - >>> df.unnest("t_struct") - shape: (2, 6) - ┌────────┬─────┬─────┬──────┬───────────┬───────┐ - │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ - ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ - │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ - │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ - └────────┴─────┴─────┴──────┴───────────┴───────┘ - - ''' - def corr(self, **kwargs: Any) -> DataFrame: - ''' - Return pairwise Pearson product-moment correlation coefficients between columns. - - See numpy `corrcoef` for more information: - https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html - - Notes - ----- - This functionality requires numpy to be installed. - - Parameters - ---------- - **kwargs - Keyword arguments are passed to numpy `corrcoef`. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) - >>> df.corr() - shape: (3, 3) - ┌──────┬──────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════╡ - │ 1.0 ┆ -1.0 ┆ 1.0 │ - │ -1.0 ┆ 1.0 ┆ -1.0 │ - │ 1.0 ┆ -1.0 ┆ 1.0 │ - └──────┴──────┴──────┘ - - ''' - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: - ''' - Take two sorted DataFrames and merge them by the sorted key. - - The output of this operation will also be sorted. 
- It is the callers responsibility that the frames are sorted - by that key otherwise the output will not make sense. - - The schemas of both DataFrames must be equal. - - Parameters - ---------- - other - Other DataFrame that must be merged - key - Key that is sorted. - - Examples - -------- - >>> df0 = pl.DataFrame( - ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} - ... ).sort("age") - >>> df0 - shape: (3, 2) - ┌───────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═════╡ - │ bob ┆ 18 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └───────┴─────┘ - >>> df1 = pl.DataFrame( - ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} - ... ).sort("age") - >>> df1 - shape: (4, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - └────────┴─────┘ - >>> df0.merge_sorted(df1, key="age") - shape: (7, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ bob ┆ 18 │ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └────────┴─────┘ - ''' - def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: - """ - Indicate that one or multiple columns are sorted. - - Parameters - ---------- - column - Columns that are sorted - more_columns - Additional columns that are sorted, specified as positional arguments. - descending - Whether the columns are sorted in descending order. - """ - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: - ''' - Update the values in this `DataFrame` with the values in `other`. - - By default, null values in the right dataframe are ignored. Use - `ignore_nulls=False` to overwrite values in this frame with null values in other - frame. - - Notes - ----- - This is syntactic sugar for a left/inner join, with an optional coalesce when - `include_nulls = False`. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Parameters - ---------- - other - DataFrame that will be used to update the values - on - Column names that will be joined on. - If none given the row count is used. - left_on - Join column(s) of the left DataFrame. - right_on - Join column(s) of the right DataFrame. - how : {\'left\', \'inner\', \'outer\'} - * \'left\' will keep all rows from the left table; rows may be duplicated - if multiple rows in the right frame match the left row\'s key. - * \'inner\' keeps only those rows where the key exists in both frames. - * \'outer\' will update existing rows where the key matches while also - adding any new rows contained in the given frame. - include_nulls - If True, null values from the right dataframe will be used to update the - left dataframe. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4], - ... "B": [400, 500, 600, 700], - ... } - ... ) - >>> df - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 400 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - >>> new_df = pl.DataFrame( - ... { - ... "B": [-66, None, -99], - ... "C": [5, 3, 1], - ... } - ... 
) - - Update `df` values with the non-null values in `new_df`, by row index: - - >>> df.update(new_df) - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, by row index, - but only keeping those rows that are common to both frames: - - >>> df.update(new_df, how="inner") - shape: (3, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") - shape: (5, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴─────┘ - - Update `df` values including null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> df.update( - ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ null │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴──────┘ - - ''' - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: - """ - Start a group by operation. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.group_by`. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - .. note:: - Within each group, the order of rows is always preserved, regardless - of this argument. - - Returns - ------- - GroupBy - Object which can be used to perform aggregations. - - """ - def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. 
- Doing so incorrectly will lead to incorrect output - - """ - def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.9 - This method has been renamed to :func:`DataFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - """ - def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.group_by_dynamic`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - include_boundaries - Add the lower and upper bound of the window to the "_lower_bound" and - "_upper_bound" columns. This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. 
- check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - DynamicGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - ''' - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: - """ - Apply a custom/user-defined function (UDF) over the rows of the DataFrame. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.map_rows`. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output type of the operation. If none given, Polars tries to infer the type. - inference_size - Only used in the case when the custom function returns rows. - This uses the first `n` rows to determine the output schema - - """ - def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - fill None values with this value. - n - Number of places to shift (may be negative). - - """ - def take_every(self, n: int) -> DataFrame: - """ - Take every nth row in the DataFrame and return as a new DataFrame. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def find_idx_by_name(self, name: str) -> int: - """ - Find the index of a column by name. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`get_column_index`. - - Parameters - ---------- - name - Name of the column to find. - """ - def insert_at_idx(self, index: int, column: Series) -> Self: - """ - Insert a Series at a certain column index. This operation is in place. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`insert_column`. - - Parameters - ---------- - index - Column to insert the new `Series` column. - column - `Series` to insert. - """ - def replace_at_idx(self, index: int, new_column: Series) -> Self: - """ - Replace a column at an index location. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`replace_column`. - - Parameters - ---------- - index - Column index. - new_column - Series that will replace the column. - """ - def frame_equal(self, other: DataFrame) -> bool: - """ - Check whether the DataFrame is equal to another DataFrame. - - .. deprecated:: 0.19.16 - This method has been renamed to :func:`equals`. - - Parameters - ---------- - other - DataFrame to compare with. - null_equal - Consider null values as equal. - """ - @property - def shape(self): ... - @property - def height(self): ... - @property - def width(self): ... - @property - def dtypes(self): ... - @property - def flags(self): ... - @property - def schema(self): ... -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
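(Editor's aside, not part of the diff: a brief sketch contrasting the deprecated spellings documented in the removed stub above with the replacements named in their deprecation notes; assumes a 0.19.x Polars where both spellings still resolve.)

import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})

# Deprecated spellings still present in the stub ...
df.groupby("foo").agg(pl.col("bar").sum())   # renamed in 0.19.0
df.take_every(2)                             # renamed in 0.19.14
df.frame_equal(df)                           # renamed in 0.19.16

# ... and their current replacements.
df.group_by("foo").agg(pl.col("bar").sum())
df.gather_every(2)
df.equals(df)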
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/dataframe/frame.pyi new file mode 100644 index 0000000..6ba7128 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/dataframe/frame.pyi @@ -0,0 +1,7092 @@ +#: version 0.20.0 +import P +import deltalake +import deltalake.table +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Enum as Enum, Float64 as Float64, Null as Null, Object as Object, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import ModuleUpgradeRequired as ModuleUpgradeRequired, NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, _warn_null_comparison as _warn_null_comparison, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as 
handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PANDAS_AVAILABLE: bool +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. 
+ infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. 
The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use `pl.read_csv` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use `pl.read_parquet` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading `n_rows`. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use `pl.read_json` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. 
+ + Use `pl.read_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with `NaN`. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to `True` will raise a `NotImplementedError`. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars DataFrame to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... 
+ def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the DataFrame as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to `df[0,0]`, with a check that + the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are Series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + `structured` is set to `False` and the DataFrame dtypes allow for a + global dtype for all columns. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + function for the conversion to numpy if necessary. + + Notes + ----- + If you\'re attempting to convert Utf8 or Decimal to an array, you\'ll need to + install `pyarrow`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. 
, \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Utf8), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Utf8), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. 
This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path or writeable file-like object to which the data will be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + name + Schema name. Defaults to empty string. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open `xlsxwriter.Workbook` object that has not been closed. + If None, writes to a `dataframe.xlsx` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of `{"key":value,}` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. + column_formats : dict + A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. + dtype_formats : dict + A `{dtype:str,}` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + `column_formats` param). It is also valid to use dtype groups such as + `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid `xlsxwriter` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all `xlsxwriter` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. 
+ header_format : dict + A `{key:value,}` dictionary of `xlsxwriter` format options to apply + to the table header row, such as `{"bold":True, "font_color":"#702963"}`. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a `{colname:funcname,}` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A `{colname:int,}` or `{selector:int,}` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a `{colname:columns,}` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or `{row_index:int,}` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that `row_index` starts at zero and will be + the header row (unless `include_header` is False). + sparklines : dict + A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an `xlsxwriter`-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. 
+ float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + include_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible `xlsxwriter` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic DataFrame: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... 
) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... 
# create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC data will be + written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC record batch data will + be written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. 
+ compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + data_page_size + Size of the data page in bytes. Defaults to 1024^2 bytes. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to `pyarrow.parquet.write_table`. + + If you pass `partition_cols` here, the dataset will be written + using `pyarrow.parquet.write_to_dataset`. + The `partition_cols` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, table_name: str, connection: str) -> int: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Schema-qualified name of the table to create or append to in the target + SQL database. If your table name contains special characters, it should + be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_table_exists : {\'append\', \'replace\', \'fail\'} + The insert mode: + + * \'replace\' will create a new database table, overwriting an existing one. + * \'append\' will append to an existing table. + * \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine to use for writing frame data. + + Returns + ------- + int + The number of rows affected, if the driver provides this information. + Otherwise, returns -1. + + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> deltalake.table.TableMerger | None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\', \'merge\'} + How to handle existing data. + + - If \'error\', throw an error if the table already exists (default). + - If \'append\', will add new data. + - If \'overwrite\', will replace table with new data. + - If \'ignore\', will not write anything if table already exists. + - If \'merge\', return a `TableMerger` object to merge data from the DataFrame + with the existing data. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. 
+ + - See a list of supported storage options for S3 `here `__. + - See a list of supported storage options for GCS `here `__. + - See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + delta_merge_options + Keyword arguments which are required to `MERGE` a Delta lake Table. + See a list of supported merge options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + TableNotFoundError + If the delta table doesn\'t exist and MERGE action is triggered + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. + + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a DataFrame as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + Merge the DataFrame with an existing Delta Lake table. + For all `TableMerger` methods, check the deltalake docs + `here `__. + + Schema evolution is not yet supported in by the `deltalake` package, therefore + `overwrite_schema` will not have any effect on a merge operation. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> ( + ... df.write_delta( + ... "table_path", + ... mode="merge", + ... delta_merge_options={ + ... "predicate": "s.foo = t.foo", + ... "source_alias": "s", + ... "target_alias": "t", + ... }, + ... ) + ... .when_matched_update_all() + ... .when_not_matched_insert_all() + ... .execute() + ... 
) # doctest: +SKIP + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... 
+ >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_column(self, index: int, column: Series) -> Self: + ''' + Insert a Series at a certain column index. + + This operation is in place. + + Parameters + ---------- + index + Index at which to insert the new `Series` column. + column + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_column(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_column(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... ) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Warning + ------- + We will never guarantee the output of describe to be stable. + It will show statistics that we deem informative and may + be updated in the future. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "float": [1.0, 2.8, 3.0], + ... "int": [4, 5, None], + ... "bool": [True, False, True], + ... "str": [None, "b", "c"], + ... "str2": ["usd", "eur", None], + ... "date": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬───────┬──────┬──────┬────────────┐ + │ describe ┆ float ┆ int ┆ bool ┆ str ┆ str2 ┆ date │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ str ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪═══════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3 ┆ 2 ┆ 2 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ False ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 2.8 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ True ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴───────┴──────┴──────┴────────────┘ + + ''' + def get_column_index(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.get_column_index("ham") + 2 + + ''' + def replace_column(self, index: int, column: Series) -> Self: + ''' + Replace a column at an index location. + + This operation is in place. + + Parameters + ---------- + index + Column index. + column + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_column(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def equals(self, other: DataFrame) -> bool: + ''' + Check whether the DataFrame is equal to another DataFrame. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + See Also + -------- + assert_frame_equal + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... 
"bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.equals(df1) + True + >>> df1.equals(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. 
+ + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... 
+ >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. 
+ + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The `GroupBy` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. 
If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... 
"values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\', \'outer_coalesce\'} + Join strategy. + + * *inner* + Returns rows that have matching values in both tables + * *left* + Returns all rows from the left table, and the matched rows from the + right table + * *outer* + Returns all rows when there is a match in either left or right table + * *outer_coalesce* + Same as \'outer\', but coalesces the key columns + * *cross* + Returns the cartisian product of rows from both tables + * *semi* + Filter rows that have a match in the right table. + * *anti* + Filter rows that not have a match in the right table. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + join_nulls + Join on null values. By default null values will never produce matches. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 5) + ┌──────┬──────┬──────┬───────┬───────────┐ + │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞══════╪══════╪══════╪═══════╪═══════════╡ + │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │ + │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │ + │ null ┆ null ┆ null ┆ z ┆ d │ + │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │ + └──────┴──────┴──────┴───────┴───────────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see `pl.StringCache()`. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: `udf(row)`. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level `apply` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level `apply` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
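+ 
+         For instance, a memoised row function might look roughly like the
+         following sketch (the helper name is illustrative only):
+ 
+         >>> from functools import lru_cache
+         >>> @lru_cache(maxsize=None)
+         ... def expensive_row_fn(row):
+         ...     return row[0] * 2 + row[1]
+         >>> pl.DataFrame({"foo": [1, 2], "bar": [3, 4]}).map_rows(
+         ...     expensive_row_fn
+         ... )  # doctest: +SKIP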
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of + this `DataFrame`, `extend` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer `vstack` over `extend` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single `DataFrame`. In the latter case, finish the sequence of + `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
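+ 
+         The `vstack`-then-`rechunk` pattern mentioned above looks roughly
+         like the following sketch (the frames are placeholders):
+ 
+         >>> out = frames[0]  # doctest: +SKIP
+         >>> for other in frames[1:]:  # doctest: +SKIP
+         ...     out = out.vstack(other)
+         >>> out = out.rechunk()  # doctest: +SKIP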
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill `value`. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to `None` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. 
+ + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying `as_dict=True`. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. 
+ + >>> df.shift(-2, fill_value=100) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`max_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def max_horizontal(self) -> Series: + ''' + Get the maximum value horizontally across columns. + + Returns + ------- + Series + A Series named `"max"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.max_horizontal() + shape: (3,) + Series: \'max\' [f64] + [ + 4.0 + 5.0 + 6.0 + ] + ''' + def min(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`min_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def min_horizontal(self) -> Series: + ''' + Get the minimum value horizontally across columns. + + Returns + ------- + Series + A Series named `"min"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.min_horizontal() + shape: (3,) + Series: \'min\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`sum_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + ''' + def sum_horizontal(self) -> Series: + ''' + Sum all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. 
+ + Returns + ------- + Series + A Series named `"sum"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.sum_horizontal() + shape: (3,) + Series: \'sum\' [f64] + [ + 5.0 + 7.0 + 9.0 + ] + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`mean_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + ''' + def mean_horizontal(self) -> Series: + ''' + Take the mean of all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"mean"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.mean_horizontal() + shape: (3,) + Series: \'mean\' [f64] + [ + 2.5 + 3.5 + 4.5 + ] + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to `None` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the `DataFrame` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. 
+ + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + - Int8 + Utf8 = Utf8 + - Float32 + Int64 = Float32 + - Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The `index` and `by_predicate` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using `by_predicate` it is an error condition if anything other than + one row is returned; more than one row raises `TooManyRowsReturnedError`, and + zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of `iter_rows()` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify `named=True` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use `by_predicate` to return the row that matches the given predicate. 
+ + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using `iter_rows` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_columns(self) -> Iterator[Series]: + ''' + Returns an iterator over the DataFrame\'s columns. 
+ + Notes + ----- + Consider whether you can use :func:`all` instead. + If you can, it will be more efficient. + + Returns + ------- + Iterator of Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [s.name for s in df.iter_columns()] + [\'a\', \'b\'] + + If you\'re using this to modify a dataframe\'s columns, e.g. + + >>> # Do NOT do this + >>> pl.DataFrame(column * 2 for column in df.iter_columns()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + + then consider whether you can use :func:`all` instead: + + >>> df.select(pl.all() * 2) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def gather_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.gather_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash_rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... 
) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str = ...) -> Series: + ''' + Convert a `DataFrame` to a `Series` of type `Struct`. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy `corrcoef` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy `corrcoef`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the values in `other`. + + .. warning:: + This functionality is experimental and may change without it being + considered a breaking change. + + By default, null values in the right frame are ignored. Use + `include_nulls=False` to overwrite values in this frame with + null values in the other frame. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce + when `include_nulls = False` + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. 
+ * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> DataFrame: + """ + Take every nth row in the DataFrame and return as a new DataFrame. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def find_idx_by_name(self, name: str) -> int: + """ + Find the index of a column by name. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`get_column_index`. + + Parameters + ---------- + name + Name of the column to find. + """ + def insert_at_idx(self, index: int, column: Series) -> Self: + """ + Insert a Series at a certain column index. This operation is in place. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`insert_column`. + + Parameters + ---------- + index + Column to insert the new `Series` column. + column + `Series` to insert. + """ + def replace_at_idx(self, index: int, new_column: Series) -> Self: + """ + Replace a column at an index location. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`replace_column`. + + Parameters + ---------- + index + Column index. + new_column + Series that will replace the column. + """ + def frame_equal(self, other: DataFrame) -> bool: + """ + Check whether the DataFrame is equal to another DataFrame. + + .. deprecated:: 0.19.16 + This method has been renamed to :func:`equals`. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... 
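The stub content above documents several deprecated DataFrame methods together with their current replacements (`groupby` -> `group_by`, `take_every` -> `gather_every`, `find_idx_by_name` -> `get_column_index`, `frame_equal` -> `equals`, `shift_and_fill` -> `shift(..., fill_value=...)`), and the `update` docstring notes that it is syntactic sugar for a left/inner join with an optional coalesce. The short sketch below is only a hedged illustration of both points, not part of the generated stub: it assumes a locally installed polars recent enough to expose the renamed methods (roughly 0.19.16 or newer), and the frames and column names are invented for the example.

import polars as pl

df = pl.DataFrame({"group": ["a", "a", "b"], "value": [1, None, 3]})

# Current spellings of the methods whose deprecated forms are stubbed above.
per_group = df.group_by("group", maintain_order=True).agg(pl.col("value").sum())  # groupby -> group_by
every_other = df.gather_every(2)                                                  # take_every -> gather_every
value_idx = df.get_column_index("value")                                          # find_idx_by_name -> get_column_index
assert df.equals(df)                                                              # frame_equal -> equals
shifted = df.with_columns(pl.col("value").shift(1, fill_value=0))                 # shift_and_fill -> shift(..., fill_value=...)

# `update` expressed as the left join plus coalesce it is documented to sugar over
# (non-null values from `other` win; nulls in `other` are ignored by default).
other = pl.DataFrame({"group": ["a", "b"], "value": [10, None]})
via_update = df.update(other, on="group")
via_join = (
    df.join(other, on="group", how="left", suffix="_new")
    .with_columns(pl.coalesce(pl.col("value_new"), pl.col("value")).alias("value"))
    .drop("value_new")
)
# `via_update` and `via_join` should hold the same data: both keep 3 for "b"
# (null in `other` is ignored) and overwrite both "a" rows with 10.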
+def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/expr/expr deleted file mode 100644 index 5131d44..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/expr/expr +++ /dev/null @@ -1,8289 +0,0 @@ -import P -import np as np -import pl -from builtins import PyExpr -from datetime import timedelta -from polars.datatypes.classes import Categorical as Categorical, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 -from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy -from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import no_default as no_default, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence - -TYPE_CHECKING: bool -py_arg_where: builtin_function_or_method -pyreduce: builtin_function_or_method - -class Expr: - _pyexpr: _ClassVar[None] = ... - _accessors: _ClassVar[set] = ... - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _repr_html_(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int | bool) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... 
- def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int | bool) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr | int | bool) -> Self: ... - def __rxor__(self, other: Any) -> Self: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: - """Numpy universal functions.""" - @classmethod - def from_json(cls, value: str) -> Self: - """ - Read an expression from a JSON encoded string to construct an Expression. - - Parameters - ---------- - value - JSON encoded string value - - """ - def to_physical(self) -> Self: - ''' - Cast to physical representation of the logical dtype. - - - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` - - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` - - `List(inner)` -> `List(physical of inner)` - - Other data types will be left unchanged. - - Examples - -------- - Replicating the pandas - `pd.factorize - `_ - function. - - >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( - ... [ - ... pl.col("vals").cast(pl.Categorical), - ... pl.col("vals") - ... .cast(pl.Categorical) - ... .to_physical() - ... .alias("vals_physical"), - ... ] - ... ) - shape: (4, 2) - ┌──────┬───────────────┐ - │ vals ┆ vals_physical │ - │ --- ┆ --- │ - │ cat ┆ u32 │ - ╞══════╪═══════════════╡ - │ a ┆ 0 │ - │ x ┆ 1 │ - │ null ┆ null │ - │ a ┆ 0 │ - └──────┴───────────────┘ - - ''' - def any(self) -> Self: - ''' - Return whether any of the values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is null. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, False], - ... "b": [False, False], - ... "c": [None, False], - ... } - ... ) - >>> df.select(pl.col("*").any()) - shape: (1, 3) - ┌──────┬───────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪═══════╡ - │ true ┆ false ┆ false │ - └──────┴───────┴───────┘ - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> df.select(pl.col("*").any(ignore_nulls=False)) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ null │ - └──────┴───────┴──────┘ - - ''' - def all(self) -> Self: - ''' - Return whether all values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - .. 
note:: - This method is not to be confused with the function :func:`polars.all`, - which can be used to select all columns. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is null. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, True], - ... "b": [False, True], - ... "c": [None, True], - ... } - ... ) - >>> df.select(pl.col("*").all()) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ true │ - └──────┴───────┴──────┘ - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> df.select(pl.col("*").all(ignore_nulls=False)) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ null │ - └──────┴───────┴──────┘ - - ''' - def arg_true(self) -> Self: - ''' - Return indices where expression evaluates `True`. - - .. warning:: - Modifies number of rows returned, so will fail in combination with other - expressions. Use as only expression in `select` / `with_columns`. - - See Also - -------- - Series.arg_true : Return indices where Series is True - polars.arg_where - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) - >>> df.select((pl.col("a") == 1).arg_true()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - │ 3 │ - └─────┘ - - ''' - def sqrt(self) -> Self: - ''' - Compute the square root of the elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").sqrt()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.414214 │ - │ 2.0 │ - └──────────┘ - - ''' - def cbrt(self) -> Self: - ''' - Compute the cube root of the elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").cbrt()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.259921 │ - │ 1.587401 │ - └──────────┘ - - ''' - def log10(self) -> Self: - ''' - Compute the base 10 logarithm of the input array, element-wise. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").log10()) - shape: (3, 1) - ┌─────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞═════════╡ - │ 0.0 │ - │ 0.30103 │ - │ 0.60206 │ - └─────────┘ - - ''' - def exp(self) -> Self: - ''' - Compute the exponential, element-wise. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").exp()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 2.718282 │ - │ 7.389056 │ - │ 54.59815 │ - └──────────┘ - - ''' - def alias(self, name: str) -> Self: - ''' - Rename the expression. - - Parameters - ---------- - name - The new name. - - See Also - -------- - map - prefix - suffix - - Examples - -------- - Rename an expression to avoid overwriting an existing column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... ) - >>> df.with_columns( - ... pl.col("a") + 10, - ... pl.col("b").str.to_uppercase().alias("c"), - ... 
) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 11 ┆ x ┆ X │ - │ 12 ┆ y ┆ Y │ - │ 13 ┆ z ┆ Z │ - └─────┴─────┴─────┘ - - Overwrite the default name of literal columns to prevent errors due to duplicate - column names. - - >>> df.with_columns( - ... pl.lit(True).alias("c"), - ... pl.lit(4.0).alias("d"), - ... ) - shape: (3, 4) - ┌─────┬─────┬──────┬─────┐ - │ a ┆ b ┆ c ┆ d │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ bool ┆ f64 │ - ╞═════╪═════╪══════╪═════╡ - │ 1 ┆ x ┆ true ┆ 4.0 │ - │ 2 ┆ y ┆ true ┆ 4.0 │ - │ 3 ┆ z ┆ true ┆ 4.0 │ - └─────┴─────┴──────┴─────┘ - - ''' - def map_alias(self, function: Callable[[str], str]) -> Self: - ''' - Rename the output of an expression by mapping a function over the root name. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.map`. - - Parameters - ---------- - function - Function that maps a root name to a new name. - - See Also - -------- - keep_name - prefix - suffix - - Examples - -------- - Remove a common suffix and convert to lower case. - - >>> df = pl.DataFrame( - ... { - ... "A_reverse": [3, 2, 1], - ... "B_reverse": ["z", "y", "x"], - ... } - ... ) - >>> df.with_columns( - ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) - ... ) - shape: (3, 4) - ┌───────────┬───────────┬─────┬─────┐ - │ A_reverse ┆ B_reverse ┆ a ┆ b │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═══════════╪═══════════╪═════╪═════╡ - │ 3 ┆ z ┆ 1 ┆ x │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 1 ┆ x ┆ 3 ┆ z │ - └───────────┴───────────┴─────┴─────┘ - - ''' - def prefix(self, prefix: str) -> Self: - ''' - Add a prefix to the root column name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.prefix`. - - Parameters - ---------- - prefix - Prefix to add to the root column name. - - Notes - ----- - This will undo any previous renaming operations on the expression. - - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - suffix - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... ) - >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) - shape: (3, 4) - ┌─────┬─────┬───────────┬───────────┐ - │ a ┆ b ┆ reverse_a ┆ reverse_b │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪═════╪═══════════╪═══════════╡ - │ 1 ┆ x ┆ 3 ┆ z │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 3 ┆ z ┆ 1 ┆ x │ - └─────┴─────┴───────────┴───────────┘ - - ''' - def suffix(self, suffix: str) -> Self: - ''' - Add a suffix to the root column name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.suffix`. - - Parameters - ---------- - suffix - Suffix to add to the root column name. - - Notes - ----- - This will undo any previous renaming operations on the expression. - - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - prefix - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... 
) - >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) - shape: (3, 4) - ┌─────┬─────┬───────────┬───────────┐ - │ a ┆ b ┆ a_reverse ┆ b_reverse │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪═════╪═══════════╪═══════════╡ - │ 1 ┆ x ┆ 3 ┆ z │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 3 ┆ z ┆ 1 ┆ x │ - └─────┴─────┴───────────┴───────────┘ - - ''' - def keep_name(self) -> Self: - ''' - Keep the original root name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.keep`. - - Notes - ----- - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - alias - - Examples - -------- - Undo an alias operation. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2], - ... "b": [3, 4], - ... } - ... ) - >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 9 ┆ 3 │ - │ 18 ┆ 4 │ - └─────┴─────┘ - - Prevent errors due to duplicate column names. - - >>> df.select((pl.lit(10) / pl.all()).name.keep()) - shape: (2, 2) - ┌──────┬──────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════════╡ - │ 10.0 ┆ 3.333333 │ - │ 5.0 ┆ 2.5 │ - └──────┴──────────┘ - - ''' - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: - ''' - Exclude columns from a multi-column expression. - - Only works after a wildcard or regex column selection, and you cannot provide - both string column names *and* dtypes (you may prefer to use selectors instead). - - Parameters - ---------- - columns - The name or datatype of the column(s) to exclude. Accepts regular expression - input. Regular expressions should start with `^` and end with `$`. - *more_columns - Additional names or datatypes of columns to exclude, specified as positional - arguments. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "aa": [1, 2, 3], - ... "ba": ["a", "b", None], - ... "cc": [None, 2.5, 1.5], - ... } - ... ) - >>> df - shape: (3, 3) - ┌─────┬──────┬──────┐ - │ aa ┆ ba ┆ cc │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ f64 │ - ╞═════╪══════╪══════╡ - │ 1 ┆ a ┆ null │ - │ 2 ┆ b ┆ 2.5 │ - │ 3 ┆ null ┆ 1.5 │ - └─────┴──────┴──────┘ - - Exclude by column name(s): - - >>> df.select(pl.all().exclude("ba")) - shape: (3, 2) - ┌─────┬──────┐ - │ aa ┆ cc │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ null │ - │ 2 ┆ 2.5 │ - │ 3 ┆ 1.5 │ - └─────┴──────┘ - - Exclude by regex, e.g. removing all columns whose names end with the letter "a": - - >>> df.select(pl.all().exclude("^.*a$")) - shape: (3, 1) - ┌──────┐ - │ cc │ - │ --- │ - │ f64 │ - ╞══════╡ - │ null │ - │ 2.5 │ - │ 1.5 │ - └──────┘ - - Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: - - >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) - shape: (3, 1) - ┌──────┐ - │ ba │ - │ --- │ - │ str │ - ╞══════╡ - │ a │ - │ b │ - │ null │ - └──────┘ - - ''' - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the expression as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Examples - -------- - >>> def extract_number(expr: pl.Expr) -> pl.Expr: - ... 
"""Extract the digits from a string.""" - ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) - >>> - >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: - ... """Set even numbers negative, and scale by a user-supplied value.""" - ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) - ... return expr * n - >>> - >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) - >>> df.with_columns( - ... udfs=( - ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) - ... ), - ... ) - shape: (4, 2) - ┌──────┬──────┐ - │ val ┆ udfs │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞══════╪══════╡ - │ a: 1 ┆ 5 │ - │ b: 2 ┆ -10 │ - │ c: 3 ┆ 15 │ - │ d: 4 ┆ -20 │ - └──────┴──────┘ - - ''' - def is_not(self) -> Self: - """ - Negate a boolean expression. - - .. deprecated:: 0.19.2 - This method has been renamed to :func:`Expr.not_`. - - """ - def not_(self) -> Self: - ''' - Negate a boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, False, False], - ... "b": ["a", "b", None], - ... } - ... ) - >>> df - shape: (3, 2) - ┌───────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ bool ┆ str │ - ╞═══════╪══════╡ - │ true ┆ a │ - │ false ┆ b │ - │ false ┆ null │ - └───────┴──────┘ - >>> df.select(pl.col("a").not_()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ true │ - └───────┘ - - ''' - def is_null(self) -> Self: - ''' - Returns a boolean Series indicating which values are null. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null - shape: (5, 4) - ┌──────┬─────┬──────────┬──────────┐ - │ a ┆ b ┆ a_isnull ┆ b_isnull │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪═════╪══════════╪══════════╡ - │ 1 ┆ 1.0 ┆ false ┆ false │ - │ 2 ┆ 2.0 ┆ false ┆ false │ - │ null ┆ NaN ┆ true ┆ false │ - │ 1 ┆ 1.0 ┆ false ┆ false │ - │ 5 ┆ 5.0 ┆ false ┆ false │ - └──────┴─────┴──────────┴──────────┘ - - ''' - def is_not_null(self) -> Self: - ''' - Returns a boolean Series indicating which values are not null. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns( - ... pl.all().is_not_null().name.suffix("_not_null") # nan != null - ... ) - shape: (5, 4) - ┌──────┬─────┬────────────┬────────────┐ - │ a ┆ b ┆ a_not_null ┆ b_not_null │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪═════╪════════════╪════════════╡ - │ 1 ┆ 1.0 ┆ true ┆ true │ - │ 2 ┆ 2.0 ┆ true ┆ true │ - │ null ┆ NaN ┆ false ┆ true │ - │ 1 ┆ 1.0 ┆ true ┆ true │ - │ 5 ┆ 5.0 ┆ true ┆ true │ - └──────┴─────┴────────────┴────────────┘ - - ''' - def is_finite(self) -> Self: - ''' - Returns a boolean Series indicating which values are finite. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1.0, 2], - ... "B": [3.0, float("inf")], - ... } - ... ) - >>> df.select(pl.all().is_finite()) - shape: (2, 2) - ┌──────┬───────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞══════╪═══════╡ - │ true ┆ true │ - │ true ┆ false │ - └──────┴───────┘ - - ''' - def is_infinite(self) -> Self: - ''' - Returns a boolean Series indicating which values are infinite. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. 
- - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1.0, 2], - ... "B": [3.0, float("inf")], - ... } - ... ) - >>> df.select(pl.all().is_infinite()) - shape: (2, 2) - ┌───────┬───────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞═══════╪═══════╡ - │ false ┆ false │ - │ false ┆ true │ - └───────┴───────┘ - - ''' - def is_nan(self) -> Self: - ''' - Returns a boolean Series indicating which values are NaN. - - Notes - ----- - Floating point `NaN` (Not A Number) should not be confused - with missing data represented as `Null/None`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) - shape: (5, 3) - ┌──────┬─────┬─────────┐ - │ a ┆ b ┆ b_isnan │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪═════╪═════════╡ - │ 1 ┆ 1.0 ┆ false │ - │ 2 ┆ 2.0 ┆ false │ - │ null ┆ NaN ┆ true │ - │ 1 ┆ 1.0 ┆ false │ - │ 5 ┆ 5.0 ┆ false │ - └──────┴─────┴─────────┘ - - ''' - def is_not_nan(self) -> Self: - ''' - Returns a boolean Series indicating which values are not NaN. - - Notes - ----- - Floating point `NaN` (Not A Number) should not be confused - with missing data represented as `Null/None`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) - shape: (5, 3) - ┌──────┬─────┬──────────────┐ - │ a ┆ b ┆ b_is_not_nan │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪═════╪══════════════╡ - │ 1 ┆ 1.0 ┆ true │ - │ 2 ┆ 2.0 ┆ true │ - │ null ┆ NaN ┆ false │ - │ 1 ┆ 1.0 ┆ true │ - │ 5 ┆ 5.0 ┆ true │ - └──────┴─────┴──────────────┘ - - ''' - def agg_groups(self) -> Self: - ''' - Get the group indexes of the group by operation. - - Should be used in aggregation context only. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... "one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [94, 95, 96, 97, 97, 99], - ... } - ... ) - >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[u32] │ - ╞═══════╪═══════════╡ - │ one ┆ [0, 1, 2] │ - │ two ┆ [3, 4, 5] │ - └───────┴───────────┘ - - ''' - def count(self) -> Self: - ''' - Return the number of elements in the column. - - .. warning:: - Null values are treated like regular elements in this context. - - Examples - -------- - >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) - >>> df.select(pl.all().count()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 3 ┆ 3 │ - └─────┴─────┘ - - ''' - def len(self) -> Self: - ''' - Return the number of elements in the column. - - Null values are treated like regular elements in this context. - - Alias for :func:`count`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) - >>> df.select(pl.all().len()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 3 ┆ 3 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: - ''' - Get a slice of this expression. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. 
If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10, 11], - ... "b": [None, 4, 4, 4], - ... } - ... ) - >>> df.select(pl.all().slice(1, 2)) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 9 ┆ 4 │ - │ 10 ┆ 4 │ - └─────┴─────┘ - - ''' - def append(self, other: IntoExpr) -> Self: - ''' - Append expressions. - - This is done by adding the chunks of `other` to this `Series`. - - Parameters - ---------- - other - Expression to append. - upcast - Cast both `Series` to the same supertype. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10], - ... "b": [None, 4, 4], - ... } - ... ) - >>> df.select(pl.all().head(1).append(pl.all().tail(1))) - shape: (2, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 8 ┆ null │ - │ 10 ┆ 4 │ - └─────┴──────┘ - - ''' - def rechunk(self) -> Self: - ''' - Create a single chunk of memory for this Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - - Create a Series with 3 nulls, append column a then rechunk - - >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) - shape: (6, 1) - ┌────────┐ - │ repeat │ - │ --- │ - │ i64 │ - ╞════════╡ - │ null │ - │ null │ - │ null │ - │ 1 │ - │ 1 │ - │ 2 │ - └────────┘ - - ''' - def drop_nulls(self) -> Self: - ''' - Drop all null values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nans - - Notes - ----- - A null value is not the same as a NaN value. - To drop NaN values, use :func:`drop_nans`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) - >>> df.select(pl.col("a").drop_nulls()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 3.0 │ - │ NaN │ - └─────┘ - - ''' - def drop_nans(self) -> Self: - ''' - Drop all floating point NaN values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nulls - - Notes - ----- - A NaN value is not the same as a null value. - To drop null values, use :func:`drop_nulls`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) - >>> df.select(pl.col("a").drop_nans()) - shape: (3, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 1.0 │ - │ null │ - │ 3.0 │ - └──────┘ - - ''' - def cum_sum(self) -> Self: - ''' - Get an array with the cumulative sum computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_sum().alias("cum_sum"), - ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_sum ┆ cum_sum_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 10 │ - │ 2 ┆ 3 ┆ 9 │ - │ 3 ┆ 6 ┆ 7 │ - │ 4 ┆ 10 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - Null values are excluded, but can also be filled by calling `forward_fill`. - - >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) - >>> df.with_columns( - ... pl.col("values").cum_sum().alias("value_cum_sum"), - ... pl.col("values") - ... .cum_sum() - ... .forward_fill() - ... 
.alias("value_cum_sum_all_filled"), - ... ) - shape: (8, 3) - ┌────────┬───────────────┬──────────────────────────┐ - │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞════════╪═══════════════╪══════════════════════════╡ - │ null ┆ null ┆ null │ - │ 10 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 8 ┆ 18 ┆ 18 │ - │ 9 ┆ 27 ┆ 27 │ - │ null ┆ null ┆ 27 │ - │ 16 ┆ 43 ┆ 43 │ - │ null ┆ null ┆ 43 │ - └────────┴───────────────┴──────────────────────────┘ - - ''' - def cum_prod(self) -> Self: - ''' - Get an array with the cumulative product computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_prod().alias("cum_prod"), - ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), - ... ) - shape: (4, 3) - ┌─────┬──────────┬──────────────────┐ - │ a ┆ cum_prod ┆ cum_prod_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪══════════╪══════════════════╡ - │ 1 ┆ 1 ┆ 24 │ - │ 2 ┆ 2 ┆ 24 │ - │ 3 ┆ 6 ┆ 12 │ - │ 4 ┆ 24 ┆ 4 │ - └─────┴──────────┴──────────────────┘ - - ''' - def cum_min(self) -> Self: - ''' - Get an array with the cumulative min computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_min().alias("cum_min"), - ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_min ┆ cum_min_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 1 ┆ 2 │ - │ 3 ┆ 1 ┆ 3 │ - │ 4 ┆ 1 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - ''' - def cum_max(self) -> Self: - ''' - Get an array with the cumulative max computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_max().alias("cum_max"), - ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_max ┆ cum_max_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 4 │ - │ 2 ┆ 2 ┆ 4 │ - │ 3 ┆ 3 ┆ 4 │ - │ 4 ┆ 4 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - Null values are excluded, but can also be filled by calling `forward_fill`. - - >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) - >>> df.with_columns( - ... pl.col("values").cum_max().alias("cum_max"), - ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), - ... ) - shape: (8, 3) - ┌────────┬─────────┬────────────────────┐ - │ values ┆ cum_max ┆ cum_max_all_filled │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞════════╪═════════╪════════════════════╡ - │ null ┆ null ┆ null │ - │ 10 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 8 ┆ 10 ┆ 10 │ - │ 9 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 16 ┆ 16 ┆ 16 │ - │ null ┆ null ┆ 16 │ - └────────┴─────────┴────────────────────┘ - - ''' - def cum_count(self) -> Self: - ''' - Get an array with the cumulative count computed at every element. - - Counting from 0 to len - - Parameters - ---------- - reverse - Reverse the operation. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_count().alias("cum_count"), - ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), - ... ) - shape: (4, 3) - ┌─────┬───────────┬───────────────────┐ - │ a ┆ cum_count ┆ cum_count_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ u32 ┆ u32 │ - ╞═════╪═══════════╪═══════════════════╡ - │ 1 ┆ 0 ┆ 3 │ - │ 2 ┆ 1 ┆ 2 │ - │ 3 ┆ 2 ┆ 1 │ - │ 4 ┆ 3 ┆ 0 │ - └─────┴───────────┴───────────────────┘ - - ''' - def floor(self) -> Self: - ''' - Rounds down to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) - >>> df.select(pl.col("a").floor()) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - │ 0.0 │ - │ 1.0 │ - │ 1.0 │ - └─────┘ - - ''' - def ceil(self) -> Self: - ''' - Rounds up to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) - >>> df.select(pl.col("a").ceil()) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 1.0 │ - │ 1.0 │ - │ 2.0 │ - └─────┘ - - ''' - def round(self, decimals: int = ...) -> Self: - ''' - Round underlying floating point data by `decimals` digits. - - Parameters - ---------- - decimals - Number of decimals to round by. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) - >>> df.select(pl.col("a").round(1)) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.3 │ - │ 0.5 │ - │ 1.0 │ - │ 1.2 │ - └─────┘ - - ''' - def round_sig_figs(self, digits: int) -> Self: - ''' - Round to a number of significant figures. - - Parameters - ---------- - digits - Number of significant figures to round to. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) - >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) - shape: (3, 2) - ┌─────────┬────────────────┐ - │ a ┆ round_sig_figs │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════════╪════════════════╡ - │ 0.01234 ┆ 0.012 │ - │ 3.333 ┆ 3.3 │ - │ 1234.0 ┆ 1200.0 │ - └─────────┴────────────────┘ - - ''' - def dot(self, other: Expr | str) -> Self: - ''' - Compute the dot/inner product between two Expressions. - - Parameters - ---------- - other - Expression to compute dot product with. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> df.select(pl.col("a").dot(pl.col("b"))) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 44 │ - └─────┘ - - ''' - def mode(self) -> Self: - ''' - Compute the most occurring value(s). - - Can return multiple Values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 1, 2, 3], - ... "b": [1, 1, 2, 2], - ... } - ... ) - >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 1 │ - │ 1 ┆ 2 │ - └─────┴─────┘ - - ''' - def cast(self, dtype: PolarsDataType | type[Any]) -> Self: - ''' - Cast between data types. - - Parameters - ---------- - dtype - DataType to cast to. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["4", "5", "6"], - ... } - ... ) - >>> df.with_columns( - ... [ - ... pl.col("a").cast(pl.Float64), - ... 
pl.col("b").cast(pl.Int32), - ... ] - ... ) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ i32 │ - ╞═════╪═════╡ - │ 1.0 ┆ 4 │ - │ 2.0 ┆ 5 │ - │ 3.0 ┆ 6 │ - └─────┴─────┘ - - ''' - def sort(self) -> Self: - ''' - Sort this column. - - When used in a projection/selection context, the whole column is sorted. - When used in a group by context, the groups are sorted. - - Parameters - ---------- - descending - Sort in descending order. - nulls_last - Place null values last. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, None, 3, 2], - ... } - ... ) - >>> df.select(pl.col("a").sort()) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ null │ - │ 1 │ - │ 2 │ - │ 3 │ - └──────┘ - >>> df.select(pl.col("a").sort(descending=True)) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ null │ - │ 3 │ - │ 2 │ - │ 1 │ - └──────┘ - >>> df.select(pl.col("a").sort(nulls_last=True)) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ 1 │ - │ 2 │ - │ 3 │ - │ null │ - └──────┘ - - When sorting in a group by context, the groups are sorted. - - >>> df = pl.DataFrame( - ... { - ... "group": ["one", "one", "one", "two", "two", "two"], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌───────┬────────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪════════════╡ - │ two ┆ [3, 4, 99] │ - │ one ┆ [1, 2, 98] │ - └───────┴────────────┘ - - ''' - def top_k(self, k: int | IntoExprColumn = ...) -> Self: - ''' - Return the `k` largest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - bottom_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("value").top_k().alias("top_k"), - ... pl.col("value").bottom_k().alias("bottom_k"), - ... ] - ... ) - shape: (5, 2) - ┌───────┬──────────┐ - │ top_k ┆ bottom_k │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═══════╪══════════╡ - │ 99 ┆ 1 │ - │ 98 ┆ 2 │ - │ 4 ┆ 3 │ - │ 3 ┆ 4 │ - │ 2 ┆ 98 │ - └───────┴──────────┘ - - ''' - def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: - ''' - Return the `k` smallest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - top_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("value").top_k().alias("top_k"), - ... pl.col("value").bottom_k().alias("bottom_k"), - ... ] - ... ) - shape: (5, 2) - ┌───────┬──────────┐ - │ top_k ┆ bottom_k │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═══════╪══════════╡ - │ 99 ┆ 1 │ - │ 98 ┆ 2 │ - │ 4 ┆ 3 │ - │ 3 ┆ 4 │ - │ 2 ┆ 98 │ - └───────┴──────────┘ - - ''' - def arg_sort(self) -> Self: - ''' - Get the index values that would sort this column. - - Parameters - ---------- - descending - Sort in descending (descending) order. - nulls_last - Place null values last instead of first. - - Returns - ------- - Expr - Expression of data type :class:`UInt32`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... 
) - >>> df.select(pl.col("a").arg_sort()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - │ 0 │ - │ 2 │ - └─────┘ - - ''' - def arg_max(self) -> Self: - ''' - Get the index of the maximal value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... ) - >>> df.select(pl.col("a").arg_max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def arg_min(self) -> Self: - ''' - Get the index of the minimal value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... ) - >>> df.select(pl.col("a").arg_min()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - └─────┘ - - ''' - def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: - ''' - Find indices where elements should be inserted to maintain order. - - .. math:: a[i-1] < v <= a[i] - - Parameters - ---------- - element - Expression or scalar value. - side : {\'any\', \'left\', \'right\'} - If \'any\', the index of the first suitable location found is given. - If \'left\', the index of the leftmost suitable location found is given. - If \'right\', return the rightmost suitable location found is given. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "values": [1, 2, 3, 5], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("values").search_sorted(0).alias("zero"), - ... pl.col("values").search_sorted(3).alias("three"), - ... pl.col("values").search_sorted(6).alias("six"), - ... ] - ... ) - shape: (1, 3) - ┌──────┬───────┬─────┐ - │ zero ┆ three ┆ six │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞══════╪═══════╪═════╡ - │ 0 ┆ 2 ┆ 4 │ - └──────┴───────┴─────┘ - - ''' - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: - ''' - Sort this column by the ordering of other columns. - - When used in a projection/selection context, the whole column is sorted. - When used in a group by context, the groups are sorted. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. - *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "a", "b", "b"], - ... "value1": [1, 3, 4, 2], - ... "value2": [8, 7, 6, 5], - ... } - ... ) - >>> df.select(pl.col("group").sort_by("value1")) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ a │ - │ b │ - │ a │ - │ b │ - └───────┘ - - Sorting by expressions is also supported. - - >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ b │ - │ a │ - │ a │ - │ b │ - └───────┘ - - Sort by multiple columns by passing a list of columns. - - >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ b │ - │ a │ - │ b │ - │ a │ - └───────┘ - - Or use positional arguments to sort by multiple columns in the same way. 
- - >>> df.select(pl.col("group").sort_by("value1", "value2")) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ a │ - │ b │ - │ a │ - │ b │ - └───────┘ - - When sorting in a group by context, the groups are sorted. - - >>> df.group_by("group").agg( - ... pl.col("value1").sort_by("value2") - ... ) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value1 │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ a ┆ [3, 1] │ - │ b ┆ [2, 4] │ - └───────┴───────────┘ - - Take a single row from each group where a column attains its minimal value - within that group. - - >>> df.group_by("group").agg( - ... pl.all().sort_by("value2").first() - ... ) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌───────┬────────┬────────┐ - │ group ┆ value1 ┆ value2 | - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 | - ╞═══════╪════════╪════════╡ - │ a ┆ 3 ┆ 7 | - │ b ┆ 2 ┆ 5 | - └───────┴────────┴────────┘ - - ''' - def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: - ''' - Take values by index. - - Parameters - ---------- - indices - An expression that leads to a UInt32 dtyped Series. - - Returns - ------- - Expr - Expression of the same data type. - - See Also - -------- - Expr.get : Take a single value - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... "one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group", maintain_order=True).agg( - ... pl.col("value").gather([2, 1]) - ... ) - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ one ┆ [2, 98] │ - │ two ┆ [4, 99] │ - └───────┴───────────┘ - ''' - def get(self, index: int | Expr) -> Self: - ''' - Return a single value by index. - - Parameters - ---------- - index - An expression that leads to a UInt32 index. - - Returns - ------- - Expr - Expression of the same data type. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... "one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) - shape: (2, 2) - ┌───────┬───────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═══════╡ - │ one ┆ 98 │ - │ two ┆ 99 │ - └───────┴───────┘ - - ''' - def shift(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns(shift=pl.col("a").shift()) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ null │ - │ 2 ┆ 1 │ - │ 3 ┆ 2 │ - │ 4 ┆ 3 │ - └─────┴───────┘ - - Pass a negative value to shift in the opposite direction instead. 
- - >>> df.with_columns(shift=pl.col("a").shift(-2)) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - │ 3 ┆ null │ - │ 4 ┆ null │ - └─────┴───────┘ - - Specify `fill_value` to fill the resulting null values. - - >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - │ 3 ┆ 100 │ - │ 4 ┆ 100 │ - └─────┴───────┘ - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: - ''' - Fill null values using the specified value or strategy. - - To interpolate over null values see interpolate. - See the examples below to fill nulls with an expression. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... } - ... ) - >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 0 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(99)) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 99 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 4 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞══════╪═════╡ - │ 1 ┆ 4.0 │ - │ 2 ┆ 5.0 │ - │ null ┆ 6.0 │ - └──────┴─────┘ - >>> df.with_columns(pl.all().fill_null(pl.all().median())) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 1.0 ┆ 4.0 │ - │ 2.0 ┆ 5.0 │ - │ 1.5 ┆ 6.0 │ - └─────┴─────┘ - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Self: - ''' - Fill floating point NaN value with a fill value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, None, float("nan")], - ... "b": [4.0, float("nan"), 6], - ... } - ... ) - >>> df.with_columns(pl.col("b").fill_nan(0)) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪═════╡ - │ 1.0 ┆ 4.0 │ - │ null ┆ 0.0 │ - │ NaN ┆ 6.0 │ - └──────┴─────┘ - - ''' - def forward_fill(self, limit: int | None = ...) -> Self: - ''' - Fill missing values with the latest seen values. - - Parameters - ---------- - limit - The number of consecutive null values to forward fill. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... } - ... ) - >>> df.select(pl.all().forward_fill()) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 4 │ - │ 2 ┆ 6 │ - └─────┴─────┘ - - ''' - def backward_fill(self, limit: int | None = ...) -> Self: - ''' - Fill missing values with the next to be seen values. 
- - Parameters - ---------- - limit - The number of consecutive null values to backward fill. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... "c": [None, None, 2], - ... } - ... ) - >>> df.select(pl.all().backward_fill()) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 4 ┆ 2 │ - │ 2 ┆ 6 ┆ 2 │ - │ null ┆ 6 ┆ 2 │ - └──────┴─────┴─────┘ - >>> df.select(pl.all().backward_fill(limit=1)) - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ 1 ┆ 4 ┆ null │ - │ 2 ┆ 6 ┆ 2 │ - │ null ┆ 6 ┆ 2 │ - └──────┴─────┴──────┘ - - ''' - def reverse(self) -> Self: - ''' - Reverse the selection. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4, 5], - ... "fruits": ["banana", "banana", "apple", "apple", "banana"], - ... "B": [5, 4, 3, 2, 1], - ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], - ... } - ... ) - >>> df.select( - ... [ - ... pl.all(), - ... pl.all().reverse().name.suffix("_reverse"), - ... ] - ... ) - shape: (5, 8) - ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ - │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ - │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ - │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ - │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ - │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ - │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ - └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ - - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Get standard deviation. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").std()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Get variance. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").var()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def max(self) -> Self: - ''' - Get maximum value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) - >>> df.select(pl.col("a").max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def min(self) -> Self: - ''' - Get minimum value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) - >>> df.select(pl.col("a").min()) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ -1.0 │ - └──────┘ - - ''' - def nan_max(self) -> Self: - ''' - Get maximum value, but propagate/poison encountered NaN values. 
- - This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0, float("nan")]}) - >>> df.select(pl.col("a").nan_max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ NaN │ - └─────┘ - - ''' - def nan_min(self) -> Self: - ''' - Get minimum value, but propagate/poison encountered NaN values. - - This differs from numpy\'s `nanmin` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0, float("nan")]}) - >>> df.select(pl.col("a").nan_min()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ NaN │ - └─────┘ - - ''' - def sum(self) -> Self: - ''' - Get sum value. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").sum()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 0 │ - └─────┘ - - ''' - def mean(self) -> Self: - ''' - Get mean value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").mean()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def median(self) -> Self: - ''' - Get median value using linear interpolation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").median()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def product(self) -> Self: - ''' - Compute the product of an expression. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").product()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def n_unique(self) -> Self: - ''' - Count unique values. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").n_unique()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def approx_n_unique(self) -> Self: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").approx_n_unique()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def null_count(self) -> Self: - ''' - Count null values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 1, None], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.select(pl.all().null_count()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 2 ┆ 0 │ - └─────┴─────┘ - - ''' - def arg_unique(self) -> Self: - ''' - Get index of first unique value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10], - ... "b": [None, 4, 4], - ... } - ... ) - >>> df.select(pl.col("a").arg_unique()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - │ 2 │ - └─────┘ - >>> df.select(pl.col("b").arg_unique()) - shape: (2, 1) - ┌─────┐ - │ b │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - └─────┘ - - ''' - def unique(self) -> Self: - ''' - Get unique values of this expression. - - Parameters - ---------- - maintain_order - Maintain order of data. This requires more work. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - │ 1 │ - └─────┘ - >>> df.select(pl.col("a").unique(maintain_order=True)) - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - └─────┘ - - ''' - def first(self) -> Self: - ''' - Get the first value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").first()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - └─────┘ - - ''' - def last(self) -> Self: - ''' - Get the last value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").last()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: - ''' - Compute expressions over the given groups. - - This expression is similar to performing a group by aggregation and joining the - result back into the original DataFrame. - - The outcome is similar to how `window functions - `_ - work in PostgreSQL. - - Parameters - ---------- - expr - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_exprs - Additional columns to group by, specified as positional arguments. - mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} - - group_to_rows - If the aggregation results in multiple values, assign them back to their - position in the DataFrame. This can only be done if the group yields - the same elements before aggregation as after. - - join - Join the groups as \'List\' to the row positions. - warning: this can be memory intensive. - - explode - Don\'t do any mapping, but simply flatten the group. - This only makes sense if the input data is sorted. - - Examples - -------- - Pass the name of a column to compute the expression over that column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "a", "b", "b", "b"], - ... "b": [1, 2, 3, 5, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... ) - >>> df.with_columns( - ... pl.col("c").max().over("a").name.suffix("_max"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_max │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 5 │ - │ b ┆ 3 ┆ 3 ┆ 3 │ - │ b ┆ 5 ┆ 2 ┆ 3 │ - │ b ┆ 3 ┆ 1 ┆ 3 │ - └─────┴─────┴─────┴───────┘ - - Expression input is supported. - - >>> df.with_columns( - ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_max │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 4 │ - │ b ┆ 5 ┆ 2 ┆ 2 │ - │ b ┆ 3 ┆ 1 ┆ 4 │ - └─────┴─────┴─────┴───────┘ - - Group by multiple columns by passing a list of column names or expressions. - - >>> df.with_columns( - ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_min │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 1 │ - │ b ┆ 5 ┆ 2 ┆ 2 │ - │ b ┆ 3 ┆ 1 ┆ 1 │ - └─────┴─────┴─────┴───────┘ - - Or use positional arguments to group by multiple columns in the same way. - - >>> df.with_columns( - ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_min │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 1 │ - │ b ┆ 5 ┆ 2 ┆ 1 │ - │ b ┆ 3 ┆ 1 ┆ 1 │ - └─────┴─────┴─────┴───────┘ - - ''' - def rolling(self, index_column: str) -> Self: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - If you have a time series ``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... - * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... - * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order. - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> df.with_columns( - ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), - ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), - ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), - ... 
) - shape: (6, 5) - ┌─────────────────────┬─────┬───────┬───────┬───────┐ - │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴─────┴───────┴───────┴───────┘ - - ''' - def is_unique(self) -> Self: - ''' - Get mask of unique values. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").is_unique()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ false │ - │ true │ - └───────┘ - - ''' - def is_first_distinct(self) -> Self: - ''' - Return a boolean mask indicating the first occurrence of each distinct value. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) - >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) - shape: (5, 2) - ┌─────┬───────┐ - │ a ┆ first │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪═══════╡ - │ 1 ┆ true │ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 2 ┆ false │ - └─────┴───────┘ - - ''' - def is_last_distinct(self) -> Self: - ''' - Return a boolean mask indicating the last occurrence of each distinct value. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) - >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) - shape: (5, 2) - ┌─────┬───────┐ - │ a ┆ last │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪═══════╡ - │ 1 ┆ false │ - │ 1 ┆ true │ - │ 2 ┆ false │ - │ 3 ┆ true │ - │ 2 ┆ true │ - └─────┴───────┘ - - ''' - def is_duplicated(self) -> Self: - ''' - Return a boolean mask indicating duplicated values. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").is_duplicated()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ true │ - │ true │ - │ false │ - └───────┘ - - ''' - def peak_max(self) -> Self: - ''' - Get a boolean mask of the local maximum peaks. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) - >>> df.select(pl.col("a").peak_max()) - shape: (5, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ false │ - │ false │ - │ false │ - │ true │ - └───────┘ - - ''' - def peak_min(self) -> Self: - ''' - Get a boolean mask of the local minimum peaks. - - Examples - -------- - >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) - >>> df.select(pl.col("a").peak_min()) - shape: (5, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ false │ - │ true │ - │ false │ - └───────┘ - - ''' - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Get quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) - >>> df.select(pl.col("a").quantile(0.3)) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 2.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.5 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.5 │ - └─────┘ - - ''' - def cut(self, breaks: Sequence[float]) -> Self: - ''' - Bin continuous values into discrete categories. - - Parameters - ---------- - breaks - List of unique cut points. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - left_closed - Set the intervals to be left-closed instead of right-closed. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - - Returns - ------- - Expr - Expression of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise an expression of data type :class:`Struct`. - - See Also - -------- - qcut - - Examples - -------- - Divide a column into three categories. - - >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) - >>> df.with_columns( - ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") - ... ) - shape: (5, 2) - ┌─────┬─────┐ - │ foo ┆ cut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪═════╡ - │ -2 ┆ a │ - │ -1 ┆ a │ - │ 0 ┆ b │ - │ 1 ┆ b │ - │ 2 ┆ c │ - └─────┴─────┘ - - Add both the category and the breakpoint. - - >>> df.with_columns( - ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") - ... ).unnest("cut") - shape: (5, 3) - ┌─────┬──────┬────────────┐ - │ foo ┆ brk ┆ foo_bin │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪══════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴──────┴────────────┘ - - ''' - def qcut(self, quantiles: Sequence[float] | int) -> Self: - ''' - Bin continuous values into discrete categories based on their quantiles. - - Parameters - ---------- - quantiles - Either a list of quantile probabilities between 0 and 1 or a positive - integer determining the number of bins with uniform probability. - labels - Names of the categories. The number of labels must be equal to the number - of categories. - left_closed - Set the intervals to be left-closed instead of right-closed. - allow_duplicates - If set to `True`, duplicates in the resulting quantiles are dropped, - rather than raising a `DuplicateError`. This can happen even with unique - probabilities, depending on the data. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - - Returns - ------- - Expr - Expression of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise an expression of data type :class:`Struct`. 
- - See Also - -------- - cut - - Examples - -------- - Divide a column into three categories according to pre-defined quantile - probabilities. - - >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) - >>> df.with_columns( - ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ foo ┆ qcut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪══════╡ - │ -2 ┆ a │ - │ -1 ┆ a │ - │ 0 ┆ b │ - │ 1 ┆ b │ - │ 2 ┆ c │ - └─────┴──────┘ - - Divide a column into two categories using uniform quantile probabilities. - - >>> df.with_columns( - ... pl.col("foo") - ... .qcut(2, labels=["low", "high"], left_closed=True) - ... .alias("qcut") - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ foo ┆ qcut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪══════╡ - │ -2 ┆ low │ - │ -1 ┆ low │ - │ 0 ┆ high │ - │ 1 ┆ high │ - │ 2 ┆ high │ - └─────┴──────┘ - - Add both the category and the breakpoint. - - >>> df.with_columns( - ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") - ... ).unnest("qcut") - shape: (5, 3) - ┌─────┬──────┬────────────┐ - │ foo ┆ brk ┆ foo_bin │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪══════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴──────┴────────────┘ - - ''' - def rle(self) -> Self: - ''' - Get the lengths of runs of identical values. - - Returns - ------- - Expr - Expression of data type :class:`Struct` with Fields "lengths" and "values". - - Examples - -------- - >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) - >>> df.select(pl.col("s").rle()).unnest("s") - shape: (6, 2) - ┌─────────┬────────┐ - │ lengths ┆ values │ - │ --- ┆ --- │ - │ i32 ┆ i64 │ - ╞═════════╪════════╡ - │ 2 ┆ 1 │ - │ 1 ┆ 2 │ - │ 1 ┆ 1 │ - │ 1 ┆ null │ - │ 1 ┆ 1 │ - │ 2 ┆ 3 │ - └─────────┴────────┘ - ''' - def rle_id(self) -> Self: - ''' - Map values to run IDs. - - Similar to RLE, but it maps each value to an ID corresponding to the run into - which it falls. This is especially useful when you want to define groups by - runs of identical values rather than the values themselves. - - - Examples - -------- - >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) - >>> # It works on structs of multiple values too! - >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) - shape: (5, 4) - ┌─────┬──────┬─────┬──────┐ - │ a ┆ b ┆ a_r ┆ ab_r │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ u32 ┆ u32 │ - ╞═════╪══════╪═════╪══════╡ - │ 1 ┆ x ┆ 0 ┆ 0 │ - │ 2 ┆ x ┆ 1 ┆ 1 │ - │ 1 ┆ null ┆ 2 ┆ 2 │ - │ 1 ┆ y ┆ 2 ┆ 3 │ - │ 1 ┆ y ┆ 2 ┆ 3 │ - └─────┴──────┴─────┴──────┘ - ''' - def filter(self, predicate: Expr) -> Self: - ''' - Filter a single column. - - The original order of the remaining elements is preserved. - - Mostly useful in an aggregation context. If you want to filter on a DataFrame - level, use `LazyFrame.filter`. - - Parameters - ---------- - predicate - Boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group_col": ["g1", "g1", "g2"], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.group_by("group_col").agg( - ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), - ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), - ... 
).sort("group_col") - shape: (2, 3) - ┌───────────┬─────┬─────┐ - │ group_col ┆ lt ┆ gte │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═══════════╪═════╪═════╡ - │ g1 ┆ 1 ┆ 2 │ - │ g2 ┆ 0 ┆ 3 │ - └───────────┴─────┴─────┘ - - ''' - def where(self, predicate: Expr) -> Self: - ''' - Filter a single column. - - Alias for :func:`filter`. - - Parameters - ---------- - predicate - Boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group_col": ["g1", "g1", "g2"], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.group_by("group_col").agg( - ... [ - ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), - ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), - ... ] - ... ).sort("group_col") - shape: (2, 3) - ┌───────────┬─────┬─────┐ - │ group_col ┆ lt ┆ gte │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═══════════╪═════╪═════╡ - │ g1 ┆ 1 ┆ 2 │ - │ g2 ┆ 0 ┆ 3 │ - └───────────┴─────┴─────┘ - - ''' - def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Apply a custom python function to a whole Series or sequence of Series. - - The output of this custom function must be a Series. If you want to apply a - custom function elementwise over single values, see :func:`map_elements`. - A reasonable use case for `map` functions is transforming the values - represented by an expression using a third-party library. - - Read more in `the book - `_. - - Parameters - ---------- - function - Lambda/function to apply. - return_dtype - Dtype of the output Series. - agg_list - Aggregate list. - - Notes - ----- - If you are looking to map a function over a window function or group_by context, - refer to func:`map_elements` instead. - - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - See Also - -------- - map_elements - replace - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "sine": [0.0, 1.0, 0.0, -1.0], - ... "cosine": [1.0, 0.0, -1.0, 0.0], - ... } - ... ) - >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) - shape: (1, 2) - ┌──────┬────────┐ - │ sine ┆ cosine │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪════════╡ - │ 1 ┆ 0 │ - └──────┴────────┘ - - ''' - def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Map a custom/user-defined function (UDF) to each element of a column. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - The UDF is applied to each element of a column. Note that, in a GroupBy - context, the column will have been pre-aggregated and so each element - will itself be a Series. Therefore, depending on the context, - requirements for `function` differ: - - * Selection - Expects `function` to be of type `Callable[[Any], Any]`. - Applies a Python function to each individual value in the column. - * GroupBy - Expects `function` to be of type `Callable[[Series], Any]`. - For each group, applies a Python function to the slice of the column - corresponding to that group. - - Parameters - ---------- - function - Lambda/function to map. - return_dtype - Dtype of the output Series. - If not set, the dtype will be `pl.Unknown`. - skip_nulls - Don\'t map the function over values that contain nulls (this is faster). 
- pass_name - Pass the Series name to the custom function (this is more expensive). - strategy : {\'thread_local\', \'threading\'} - This functionality is considered experimental and may be removed/changed. - - - \'thread_local\': run the python function on a single thread. - - \'threading\': run the python function on separate threads. Use with - care as this can slow performance. This might only speed up - your code if the amount of work per element is significant - and the python function releases the GIL (e.g. via calling - a c function) - - Notes - ----- - * Using `map_elements` is strongly discouraged as you will be effectively - running python "for" loops, which will be very slow. Wherever possible you - should prefer the native expression API to achieve the best performance. - - * If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. - - * Window function application using `over` is considered a GroupBy context - here, so `map_elements` can be used to map functions over window groups. - - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["a", "b", "c", "c"], - ... } - ... ) - - The function is applied to each element of column `\'a\'`: - - >>> df.with_columns( # doctest: +SKIP - ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────────┐ - │ a ┆ b ┆ a_times_2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 │ - ╞═════╪═════╪═══════════╡ - │ 1 ┆ a ┆ 2 │ - │ 2 ┆ b ┆ 4 │ - │ 3 ┆ c ┆ 6 │ - │ 1 ┆ c ┆ 2 │ - └─────┴─────┴───────────┘ - - Tip: it is better to implement this with an expression: - - >>> df.with_columns( - ... (pl.col("a") * 2).alias("a_times_2"), - ... ) # doctest: +IGNORE_RESULT - - In a GroupBy context, each element of the column is itself a Series: - - >>> ( - ... df.lazy().group_by("b").agg(pl.col("a")).collect() - ... ) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬───────────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [1] │ - │ b ┆ [2] │ - │ c ┆ [3, 1] │ - └─────┴───────────┘ - - Therefore, from the user\'s point-of-view, the function is applied per-group: - - >>> ( - ... df.lazy() - ... .group_by("b") - ... .agg(pl.col("a").map_elements(lambda x: x.sum())) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ b ┆ 2 │ - │ c ┆ 4 │ - └─────┴─────┘ - - Tip: again, it is better to implement this with an expression: - - >>> ( - ... df.lazy() - ... .group_by("b", maintain_order=True) - ... .agg(pl.col("a").sum()) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - - Window function application using `over` will behave as a GroupBy - context, with your function receiving individual window groups: - - >>> df = pl.DataFrame( - ... { - ... "key": ["x", "x", "y", "x", "y", "z"], - ... "val": [1, 1, 1, 1, 1, 1], - ... } - ... ) - >>> df.with_columns( - ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), - ... 
).sort("key") - shape: (6, 3) - ┌─────┬─────┬────────┐ - │ key ┆ val ┆ scaled │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪════════╡ - │ x ┆ 1 ┆ 3 │ - │ x ┆ 1 ┆ 3 │ - │ x ┆ 1 ┆ 3 │ - │ y ┆ 1 ┆ 2 │ - │ y ┆ 1 ┆ 2 │ - │ z ┆ 1 ┆ 1 │ - └─────┴─────┴────────┘ - - Note that this function would *also* be better-implemented natively: - - >>> df.with_columns( - ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), - ... ).sort( - ... "key" - ... ) # doctest: +IGNORE_RESULT - - ''' - def flatten(self) -> Self: - ''' - Flatten a list or string column. - - Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "b", "b"], - ... "values": [[1, 2], [2, 3], [4]], - ... } - ... ) - >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ values │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ a ┆ [1, 2] │ - │ b ┆ [2, 3, 4] │ - └───────┴───────────┘ - - ''' - def explode(self) -> Self: - ''' - Explode a list expression. - - This means that every item is expanded to a new row. - - Returns - ------- - Expr - Expression with the data type of the list elements. - - See Also - -------- - Expr.list.explode : Explode a list column. - Expr.str.explode : Explode a string column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "b"], - ... "values": [ - ... [1, 2], - ... [3, 4], - ... ], - ... } - ... ) - >>> df.select(pl.col("values").explode()) - shape: (4, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 1 │ - │ 2 │ - │ 3 │ - │ 4 │ - └────────┘ - - ''' - def implode(self) -> Self: - ''' - Aggregate values into a list. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [4, 5, 6], - ... } - ... ) - >>> df.select(pl.all().implode()) - shape: (1, 2) - ┌───────────┬───────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ list[i64] ┆ list[i64] │ - ╞═══════════╪═══════════╡ - │ [1, 2, 3] ┆ [4, 5, 6] │ - └───────────┴───────────┘ - - ''' - def gather_every(self, n: int) -> Self: - ''' - Take every nth value in the Series and return as a new Series. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) - >>> df.select(pl.col("foo").gather_every(3)) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 4 │ - │ 7 │ - └─────┘ - - ''' - def head(self, n: int | Expr = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.head(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def tail(self, n: int | Expr = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.tail(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 5 │ - │ 6 │ - │ 7 │ - └─────┘ - - ''' - def limit(self, n: int | Expr = ...) -> Self: - ''' - Get the first `n` rows (alias for :func:`Expr.head`). - - Parameters - ---------- - n - Number of rows to return. 
- - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.limit(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def and_(self, *others: Any) -> Self: - ''' - Method equivalent of bitwise "and" operator `expr & other & ...`. - - Parameters - ---------- - *others - One or more integer or boolean expressions to evaluate/combine. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5, 6, 7, 4, 8], - ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], - ... "z": [-9, 2, -1, 4, 8], - ... } - ... ) - >>> df.select( - ... (pl.col("x") >= pl.col("z")) - ... .and_( - ... pl.col("y") >= pl.col("z"), - ... pl.col("y") == pl.col("y"), - ... pl.col("z") <= pl.col("x"), - ... pl.col("y") != pl.col("x"), - ... ) - ... .alias("all") - ... ) - shape: (5, 1) - ┌───────┐ - │ all │ - │ --- │ - │ bool │ - ╞═══════╡ - │ true │ - │ true │ - │ true │ - │ false │ - │ false │ - └───────┘ - - ''' - def or_(self, *others: Any) -> Self: - ''' - Method equivalent of bitwise "or" operator `expr | other | ...`. - - Parameters - ---------- - *others - One or more integer or boolean expressions to evaluate/combine. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5, 6, 7, 4, 8], - ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], - ... "z": [-9, 2, -1, 4, 8], - ... } - ... ) - >>> df.select( - ... (pl.col("x") == pl.col("y")) - ... .or_( - ... pl.col("x") == pl.col("y"), - ... pl.col("y") == pl.col("z"), - ... pl.col("y").cast(int) == pl.col("z"), - ... ) - ... .alias("any") - ... ) - shape: (5, 1) - ┌───────┐ - │ any │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ false │ - │ true │ - │ false │ - └───────┘ - - ''' - def eq(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr == other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0], - ... "y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").eq(pl.col("y")).alias("x == y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x == y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 1.0 ┆ 2.0 ┆ false │ - │ 2.0 ┆ 2.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 4.0 ┆ 4.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def eq_missing(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr == other` where `None == None`. - - This differs from default `eq` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], - ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").eq(pl.col("y")).alias("x eq y"), - ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), - ... 
) - shape: (6, 4) - ┌──────┬──────┬────────┬────────────────┐ - │ x ┆ y ┆ x eq y ┆ x eq_missing y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪══════╪════════╪════════════════╡ - │ 1.0 ┆ 2.0 ┆ false ┆ false │ - │ 2.0 ┆ 2.0 ┆ true ┆ true │ - │ NaN ┆ NaN ┆ false ┆ false │ - │ 4.0 ┆ 4.0 ┆ true ┆ true │ - │ null ┆ 5.0 ┆ null ┆ false │ - │ null ┆ null ┆ null ┆ true │ - └──────┴──────┴────────┴────────────────┘ - - ''' - def ge(self, other: Any) -> Self: - ''' - Method equivalent of "greater than or equal" operator `expr >= other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 2.0], - ... "y": [5.0, 3.0, float("nan"), 1.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ge(pl.col("y")).alias("x >= y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x >= y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 5.0 ┆ 5.0 ┆ true │ - │ 4.0 ┆ 3.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 2.0 ┆ 1.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def gt(self, other: Any) -> Self: - ''' - Method equivalent of "greater than" operator `expr > other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 2.0], - ... "y": [5.0, 3.0, float("nan"), 1.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").gt(pl.col("y")).alias("x > y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────┐ - │ x ┆ y ┆ x > y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪═══════╡ - │ 5.0 ┆ 5.0 ┆ false │ - │ 4.0 ┆ 3.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 2.0 ┆ 1.0 ┆ true │ - └─────┴─────┴───────┘ - - ''' - def le(self, other: Any) -> Self: - ''' - Method equivalent of "less than or equal" operator `expr <= other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 0.5], - ... "y": [5.0, 3.5, float("nan"), 2.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").le(pl.col("y")).alias("x <= y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x <= y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 5.0 ┆ 5.0 ┆ true │ - │ 4.0 ┆ 3.5 ┆ false │ - │ NaN ┆ NaN ┆ false │ - │ 0.5 ┆ 2.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def lt(self, other: Any) -> Self: - ''' - Method equivalent of "less than" operator `expr < other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 3.0], - ... "y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").lt(pl.col("y")).alias("x < y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────┐ - │ x ┆ y ┆ x < y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪═══════╡ - │ 1.0 ┆ 2.0 ┆ true │ - │ 2.0 ┆ 2.0 ┆ false │ - │ NaN ┆ NaN ┆ false │ - │ 3.0 ┆ 4.0 ┆ true │ - └─────┴─────┴───────┘ - - ''' - def ne(self, other: Any) -> Self: - ''' - Method equivalent of inequality operator `expr != other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0], - ... 
"y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ne(pl.col("y")).alias("x != y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x != y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 1.0 ┆ 2.0 ┆ true │ - │ 2.0 ┆ 2.0 ┆ false │ - │ NaN ┆ NaN ┆ true │ - │ 4.0 ┆ 4.0 ┆ false │ - └─────┴─────┴────────┘ - - ''' - def ne_missing(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr != other` where `None == None`. - - This differs from default `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], - ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ne(pl.col("y")).alias("x ne y"), - ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), - ... ) - shape: (6, 4) - ┌──────┬──────┬────────┬────────────────┐ - │ x ┆ y ┆ x ne y ┆ x ne_missing y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪══════╪════════╪════════════════╡ - │ 1.0 ┆ 2.0 ┆ true ┆ true │ - │ 2.0 ┆ 2.0 ┆ false ┆ false │ - │ NaN ┆ NaN ┆ true ┆ true │ - │ 4.0 ┆ 4.0 ┆ false ┆ false │ - │ null ┆ 5.0 ┆ null ┆ true │ - │ null ┆ null ┆ null ┆ false │ - └──────┴──────┴────────┴────────────────┘ - - ''' - def add(self, other: Any) -> Self: - ''' - Method equivalent of addition operator `expr + other`. - - Parameters - ---------- - other - numeric or string value; accepts expression input. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) - >>> df.with_columns( - ... pl.col("x").add(2).alias("x+int"), - ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), - ... ) - shape: (5, 3) - ┌─────┬───────┬────────┐ - │ x ┆ x+int ┆ x+expr │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═══════╪════════╡ - │ 1 ┆ 3 ┆ 2 │ - │ 2 ┆ 4 ┆ 4 │ - │ 3 ┆ 5 ┆ 9 │ - │ 4 ┆ 6 ┆ 28 │ - │ 5 ┆ 7 ┆ 125 │ - └─────┴───────┴────────┘ - - >>> df = pl.DataFrame( - ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} - ... ) - >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) - shape: (3, 4) - ┌─────┬─────┬─────┬─────┐ - │ x ┆ y ┆ z ┆ xyz │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ str ┆ str │ - ╞═════╪═════╪═════╪═════╡ - │ a ┆ b ┆ c ┆ abc │ - │ d ┆ e ┆ f ┆ def │ - │ g ┆ h ┆ i ┆ ghi │ - └─────┴─────┴─────┴─────┘ - - ''' - def floordiv(self, other: Any) -> Self: - ''' - Method equivalent of integer division operator `expr // other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - See Also - -------- - truediv - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) - >>> df.with_columns( - ... pl.col("x").truediv(2).alias("x/2"), - ... pl.col("x").floordiv(2).alias("x//2"), - ... ) - shape: (5, 3) - ┌─────┬─────┬──────┐ - │ x ┆ x/2 ┆ x//2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ i64 │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 0.5 ┆ 0 │ - │ 2 ┆ 1.0 ┆ 1 │ - │ 3 ┆ 1.5 ┆ 1 │ - │ 4 ┆ 2.0 ┆ 2 │ - │ 5 ┆ 2.5 ┆ 2 │ - └─────┴─────┴──────┘ - - ''' - def mod(self, other: Any) -> Self: - ''' - Method equivalent of modulus operator `expr % other`. - - Parameters - ---------- - other - Numeric literal or expression value. 
- - Examples - -------- - >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) - >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) - shape: (5, 2) - ┌─────┬─────┐ - │ x ┆ x%2 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 0 ┆ 0 │ - │ 1 ┆ 1 │ - │ 2 ┆ 0 │ - │ 3 ┆ 1 │ - │ 4 ┆ 0 │ - └─────┴─────┘ - - ''' - def mul(self, other: Any) -> Self: - ''' - Method equivalent of multiplication operator `expr * other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) - >>> df.with_columns( - ... pl.col("x").mul(2).alias("x*2"), - ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), - ... ) - shape: (5, 3) - ┌─────┬─────┬───────────┐ - │ x ┆ x*2 ┆ x * xlog2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ f64 │ - ╞═════╪═════╪═══════════╡ - │ 1 ┆ 2 ┆ 0.0 │ - │ 2 ┆ 4 ┆ 2.0 │ - │ 4 ┆ 8 ┆ 8.0 │ - │ 8 ┆ 16 ┆ 24.0 │ - │ 16 ┆ 32 ┆ 64.0 │ - └─────┴─────┴───────────┘ - - ''' - def sub(self, other: Any) -> Self: - ''' - Method equivalent of subtraction operator `expr - other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("x").sub(2).alias("x-2"), - ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), - ... ) - shape: (5, 3) - ┌─────┬─────┬────────┐ - │ x ┆ x-2 ┆ x-expr │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪════════╡ - │ 0 ┆ -2 ┆ 0 │ - │ 1 ┆ -1 ┆ 0 │ - │ 2 ┆ 0 ┆ -1 │ - │ 3 ┆ 1 ┆ -3 │ - │ 4 ┆ 2 ┆ -6 │ - └─────┴─────┴────────┘ - - ''' - def truediv(self, other: Any) -> Self: - ''' - Method equivalent of float division operator `expr / other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Notes - ----- - Zero-division behaviour follows IEEE-754: - - 0/0: Invalid operation - mathematically undefined, returns NaN. - n/0: On finite operands gives an exact infinite result, eg: ±infinity. - - See Also - -------- - floordiv - - Examples - -------- - >>> df = pl.DataFrame( - ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} - ... ) - >>> df.with_columns( - ... pl.col("x").truediv(2).alias("x/2"), - ... pl.col("x").truediv(pl.col("y")).alias("x/y"), - ... ) - shape: (5, 4) - ┌─────┬──────┬──────┬───────┐ - │ x ┆ y ┆ x/2 ┆ x/y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ f64 ┆ f64 │ - ╞═════╪══════╪══════╪═══════╡ - │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ - │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ - │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ - │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ - │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ - └─────┴──────┴──────┴───────┘ - - ''' - def pow(self, exponent: int | float | None | Series | Expr) -> Self: - ''' - Method equivalent of exponentiation operator `expr ** exponent`. - - Parameters - ---------- - exponent - Numeric literal or expression exponent value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) - >>> df.with_columns( - ... pl.col("x").pow(3).alias("cube"), - ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), - ... ) - shape: (4, 3) - ┌─────┬───────┬────────────┐ - │ x ┆ cube ┆ x ** xlog2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ f64 │ - ╞═════╪═══════╪════════════╡ - │ 1 ┆ 1.0 ┆ 1.0 │ - │ 2 ┆ 8.0 ┆ 2.0 │ - │ 4 ┆ 64.0 ┆ 16.0 │ - │ 8 ┆ 512.0 ┆ 512.0 │ - └─────┴───────┴────────────┘ - - ''' - def xor(self, other: Any) -> Self: - ''' - Method equivalent of bitwise exclusive-or operator `expr ^ other`. - - Parameters - ---------- - other - Integer or boolean value; accepts expression input. 
- - Examples - -------- - >>> df = pl.DataFrame( - ... {"x": [True, False, True, False], "y": [True, True, False, False]} - ... ) - >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) - shape: (4, 3) - ┌───────┬───────┬───────┐ - │ x ┆ y ┆ x ^ y │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞═══════╪═══════╪═══════╡ - │ true ┆ true ┆ false │ - │ false ┆ true ┆ true │ - │ true ┆ false ┆ true │ - │ false ┆ false ┆ false │ - └───────┴───────┴───────┘ - - >>> def binary_string(n: int) -> str: - ... return bin(n)[2:].zfill(8) - >>> - >>> df = pl.DataFrame( - ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, - ... schema={"x": pl.UInt8, "y": pl.UInt8}, - ... ) - >>> df.with_columns( - ... pl.col("x").map_elements(binary_string).alias("bin_x"), - ... pl.col("y").map_elements(binary_string).alias("bin_y"), - ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), - ... pl.col("x") - ... .xor(pl.col("y")) - ... .map_elements(binary_string) - ... .alias("bin_xor_xy"), - ... ) - shape: (4, 6) - ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ - │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ - ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ - │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ - │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ - │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ - │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ - └─────┴─────┴──────────┴──────────┴────────┴────────────┘ - - ''' - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: - ''' - Check if elements of this expression are present in the other Series. - - Parameters - ---------- - other - Series or sequence of primitive type. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} - ... ) - >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) - shape: (3, 3) - ┌───────────┬──────────────────┬──────────┐ - │ sets ┆ optional_members ┆ contains │ - │ --- ┆ --- ┆ --- │ - │ list[i64] ┆ i64 ┆ bool │ - ╞═══════════╪══════════════════╪══════════╡ - │ [1, 2, 3] ┆ 1 ┆ true │ - │ [1, 2] ┆ 2 ┆ true │ - │ [9, 10] ┆ 3 ┆ false │ - └───────────┴──────────────────┴──────────┘ - - ''' - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: - ''' - Repeat the elements in this Series as specified in the given expression. - - The repeated elements are expanded into a `List`. - - Parameters - ---------- - by - Numeric column that determines how often the values will be repeated. - The column will be coerced to UInt32. Give this dtype to make the coercion a - no-op. - - Returns - ------- - Expr - Expression of data type :class:`List`, where the inner data type is equal - to the original data type. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["x", "y", "z"], - ... "n": [1, 2, 3], - ... } - ... ) - >>> df.select(pl.col("a").repeat_by("n")) - shape: (3, 1) - ┌─────────────────┐ - │ a │ - │ --- │ - │ list[str] │ - ╞═════════════════╡ - │ ["x"] │ - │ ["y", "y"] │ - │ ["z", "z", "z"] │ - └─────────────────┘ - - ''' - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: - ''' - Check if this expression is between the given start and end values. - - Parameters - ---------- - lower_bound - Lower bound value. Accepts expression input. 
Strings are parsed as column - names, other non-expression inputs are parsed as literals. - upper_bound - Upper bound value. Accepts expression input. Strings are parsed as column - names, other non-expression inputs are parsed as literals. - closed : {\'both\', \'left\', \'right\', \'none\'} - Define which sides of the interval are closed (inclusive). - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) - >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) - shape: (5, 2) - ┌─────┬────────────┐ - │ num ┆ is_between │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪════════════╡ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 4 ┆ true │ - │ 5 ┆ false │ - └─────┴────────────┘ - - Use the `closed` argument to include or exclude the values at the bounds: - - >>> df.with_columns( - ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") - ... ) - shape: (5, 2) - ┌─────┬────────────┐ - │ num ┆ is_between │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪════════════╡ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 4 ┆ false │ - │ 5 ┆ false │ - └─────┴────────────┘ - - You can also use strings as well as numeric/temporal values (note: ensure that - string literals are wrapped with `lit` so as not to conflate them with - column names): - - >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) - >>> df.with_columns( - ... pl.col("a") - ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") - ... .alias("is_between") - ... ) - shape: (5, 2) - ┌─────┬────────────┐ - │ a ┆ is_between │ - │ --- ┆ --- │ - │ str ┆ bool │ - ╞═════╪════════════╡ - │ a ┆ true │ - │ b ┆ true │ - │ c ┆ true │ - │ d ┆ false │ - │ e ┆ false │ - └─────┴────────────┘ - - ''' - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: - ''' - Hash the elements in the selection. - - The hash value is of type `UInt64`. - - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. - - Notes - ----- - This implementation of :func:`hash` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": ["x", None, "z"], - ... } - ... ) - >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌──────────────────────┬──────────────────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u64 ┆ u64 │ - ╞══════════════════════╪══════════════════════╡ - │ 9774092659964970114 ┆ 13614470193936745724 │ - │ 1101441246220388612 ┆ 11638928888656214026 │ - │ 11638928888656214026 ┆ 13382926553367784577 │ - └──────────────────────┴──────────────────────┘ - - ''' - def reinterpret(self) -> Self: - ''' - Reinterpret the underlying bits as a signed/unsigned integer. - - This operation is only allowed for 64bit integers. For lower bits integers, - you can safely use that cast operation. - - Parameters - ---------- - signed - If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. - - Examples - -------- - >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) - >>> df = pl.DataFrame([s]) - >>> df.select( - ... [ - ... 
pl.col("a").reinterpret(signed=True).alias("reinterpreted"), - ... pl.col("a").alias("original"), - ... ] - ... ) - shape: (3, 2) - ┌───────────────┬──────────┐ - │ reinterpreted ┆ original │ - │ --- ┆ --- │ - │ i64 ┆ u64 │ - ╞═══════════════╪══════════╡ - │ 1 ┆ 1 │ - │ 1 ┆ 1 │ - │ 2 ┆ 2 │ - └───────────────┴──────────┘ - - ''' - def inspect(self, fmt: str = ...) -> Self: - ''' - Print the value that this expression evaluates to and pass on the value. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 1, 2]}) - >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) - value is: shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 4 - ] - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 4 │ - └─────┘ - - ''' - def interpolate(self, method: InterpolationMethod = ...) -> Self: - ''' - Fill null values using interpolation. - - Parameters - ---------- - method : {\'linear\', \'nearest\'} - Interpolation method. - - Examples - -------- - Fill null values using linear interpolation. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, None, 3], - ... "b": [1.0, float("nan"), 3.0], - ... } - ... ) - >>> df.select(pl.all().interpolate()) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 1.0 ┆ 1.0 │ - │ 2.0 ┆ NaN │ - │ 3.0 ┆ 3.0 │ - └─────┴─────┘ - - Fill null values using nearest interpolation. - - >>> df.select(pl.all().interpolate("nearest")) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 1.0 │ - │ 3 ┆ NaN │ - │ 3 ┆ 3.0 │ - └─────┴─────┘ - - Regrid data to a new grid. - - >>> df_original_grid = pl.DataFrame( - ... { - ... "grid_points": [1, 3, 10], - ... "values": [2.0, 6.0, 20.0], - ... } - ... ) # Interpolate from this to the new grid - >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) - >>> df_new_grid.join( - ... df_original_grid, on="grid_points", how="left" - ... ).with_columns(pl.col("values").interpolate()) - shape: (10, 2) - ┌─────────────┬────────┐ - │ grid_points ┆ values │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════════════╪════════╡ - │ 1 ┆ 2.0 │ - │ 2 ┆ 4.0 │ - │ 3 ┆ 6.0 │ - │ 4 ┆ 8.0 │ - │ … ┆ … │ - │ 7 ┆ 14.0 │ - │ 8 ┆ 16.0 │ - │ 9 ┆ 18.0 │ - │ 10 ┆ 20.0 │ - └─────────────┴────────┘ - - ''' - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling min (moving min) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their min. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. 
Can be a fixed integer size, or a dynamic - temporal size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 4.0 │ - │ 6.0 ┆ 5.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.25 │ - │ 3.0 ┆ 0.5 │ - │ 4.0 ┆ 0.75 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ 1.25 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 4.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... 
).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - >>> df_temporal.with_columns( - ... rolling_row_min=pl.col("row_nr").rolling_min( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_min │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling max (moving max) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their max. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. 
- closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ 6.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.25 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 3.75 │ - │ 6.0 ┆ 4.5 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 3.0 │ - │ 3.0 ┆ 4.0 │ - │ 4.0 ┆ 5.0 │ - │ 5.0 ┆ 6.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling max with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_max=pl.col("row_nr").rolling_max( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_max │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling max with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_max=pl.col("row_nr").rolling_max( - ... 
window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_max │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling mean (moving mean) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their mean. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. 
- - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴──────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.75 │ - │ 3.0 ┆ 2.75 │ - │ 4.0 ┆ 3.75 │ - │ 5.0 ┆ 4.75 │ - │ 6.0 ┆ 5.75 │ - └─────┴──────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ null │ - └─────┴──────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling mean with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_mean=pl.col("row_nr").rolling_mean( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬──────────────────┐ - │ row_nr ┆ date ┆ rolling_row_mean │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪══════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ - └────────┴─────────────────────┴──────────────────┘ - - Compute the rolling mean with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_mean=pl.col("row_nr").rolling_mean( - ... window_size="2h", by="date", closed="both" - ... ) - ... 
) - shape: (25, 3) - ┌────────┬─────────────────────┬──────────────────┐ - │ row_nr ┆ date ┆ rolling_row_mean │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪══════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ - └────────┴─────────────────────┴──────────────────┘ - - ''' - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling sum (moving sum) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their sum. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - of dtype `{Date, Datetime}` - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum(window_size=2), - ... 
) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 3.0 │ - │ 3.0 ┆ 5.0 │ - │ 4.0 ┆ 7.0 │ - │ 5.0 ┆ 9.0 │ - │ 6.0 ┆ 11.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.75 │ - │ 3.0 ┆ 2.75 │ - │ 4.0 ┆ 3.75 │ - │ 5.0 ┆ 4.75 │ - │ 6.0 ┆ 5.75 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 6.0 │ - │ 3.0 ┆ 9.0 │ - │ 4.0 ┆ 12.0 │ - │ 5.0 ┆ 15.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling sum with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_sum=pl.col("row_nr").rolling_sum( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_sum │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling sum with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_sum=pl.col("row_nr").rolling_sum( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_sum │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Self: - ''' - Compute a rolling standard deviation. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` means - the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.707107 │ - │ 3.0 ┆ 0.707107 │ - │ 4.0 ┆ 0.707107 │ - │ 5.0 ┆ 0.707107 │ - │ 6.0 ┆ 0.707107 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... 
) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.433013 │ - │ 3.0 ┆ 0.433013 │ - │ 4.0 ┆ 0.433013 │ - │ 5.0 ┆ 0.433013 │ - │ 6.0 ┆ 0.433013 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 1.0 │ - │ 4.0 ┆ 1.0 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling std with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_std=pl.col("row_nr").rolling_std( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_std │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling std with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_std=pl.col("row_nr").rolling_std( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_std │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling variance. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... 
- - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.5 │ - │ 3.0 ┆ 0.5 │ - │ 4.0 ┆ 0.5 │ - │ 5.0 ┆ 0.5 │ - │ 6.0 ┆ 0.5 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.1875 │ - │ 3.0 ┆ 0.1875 │ - │ 4.0 ┆ 0.1875 │ - │ 5.0 ┆ 0.1875 │ - │ 6.0 ┆ 0.1875 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), - ... 
) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 1.0 │ - │ 4.0 ┆ 1.0 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling var with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_var=pl.col("row_nr").rolling_var( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_var │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling var with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_var=pl.col("row_nr").rolling_var( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_var │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling median. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` means - the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. 
Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴────────────────┘ - - Specify weights for the values in each window: - - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴────────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ null │ - └─────┴────────────────┘ - - ''' - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling quantile. 
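As an illustrative cross-check (not part of the stub): with an odd window size and the default "nearest" interpolation, a rolling 0.5-quantile is expected to agree with rolling_median:

import polars as pl

df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]})
df.with_columns(
    med=pl.col("A").rolling_median(window_size=3),
    q50=pl.col("A").rolling_quantile(quantile=0.5, window_size=3),
)  # the two new columns should match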
- - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - window_size - The length of the window. Can be a fixed integer size, or a dynamic - temporal size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, window_size=4 - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 2.0 │ - │ 5.0 ┆ 3.0 │ - │ 6.0 ┆ 4.0 │ - └─────┴──────────────────┘ - - Specify weights for the values in each window: - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] - ... ), - ... 
) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 2.0 │ - │ 5.0 ┆ 3.0 │ - │ 6.0 ┆ 4.0 │ - └─────┴──────────────────┘ - - Specify weights and interpolation method - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, - ... window_size=4, - ... weights=[0.2, 0.4, 0.4, 0.2], - ... interpolation="linear", - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 1.625 │ - │ 5.0 ┆ 2.625 │ - │ 6.0 ┆ 3.625 │ - └─────┴──────────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.2, window_size=5, center=True - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ null │ - │ 6.0 ┆ null │ - └─────┴──────────────────┘ - - ''' - def rolling_skew(self, window_size: int) -> Self: - ''' - Compute a rolling skew. - - The window at a given row includes the row itself and the - `window_size - 1` elements before it. - - Parameters - ---------- - window_size - Integer size of the rolling window. - bias - If False, the calculations are corrected for statistical bias. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) - >>> df.select(pl.col("a").rolling_skew(3)) - shape: (4, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ null │ - │ null │ - │ 0.381802 │ - │ 0.47033 │ - └──────────┘ - - Note how the values match the following: - - >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() - (0.38180177416060584, 0.47033046033698594) - - ''' - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a custom rolling window function. - - .. warning:: - Computing custom functions is extremely slow. Use specialized rolling - functions such as :func:`Expr.rolling_sum` if at all possible. - - Parameters - ---------- - function - Custom aggregation function. - window_size - Size of the window. The window at a given row will include the row - itself and the `window_size - 1` elements before it. - weights - A list of weights with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window. - - Examples - -------- - >>> from numpy import nansum - >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) - >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) - shape: (5, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ null │ - │ null │ - │ 22.0 │ - │ 11.0 │ - │ 17.0 │ - └──────┘ - - ''' - def abs(self) -> Self: - ''' - Compute absolute values. - - Same as `abs(expr)`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [-1.0, 0.0, 1.0, 2.0], - ... } - ... 
) - >>> df.select(pl.col("A").abs()) - shape: (4, 1) - ┌─────┐ - │ A │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 0.0 │ - │ 1.0 │ - │ 2.0 │ - └─────┘ - - ''' - def rank(self, method: RankMethod = ...) -> Self: - ''' - Assign ranks to data, dealing with ties appropriately. - - Parameters - ---------- - method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} - The method used to assign ranks to tied elements. - The following methods are available (default is \'average\'): - - - \'average\' : The average of the ranks that would have been assigned to - all the tied values is assigned to each value. - - \'min\' : The minimum of the ranks that would have been assigned to all - the tied values is assigned to each value. (This is also referred to - as "competition" ranking.) - - \'max\' : The maximum of the ranks that would have been assigned to all - the tied values is assigned to each value. - - \'dense\' : Like \'min\', but the rank of the next highest element is - assigned the rank immediately after those assigned to the tied - elements. - - \'ordinal\' : All values are given a distinct rank, corresponding to - the order that the values occur in the Series. - - \'random\' : Like \'ordinal\', but the rank for ties is not dependent - on the order that the values occur in the Series. - descending - Rank in descending order. - seed - If `method="random"`, use this as seed. - - Examples - -------- - The \'average\' method: - - >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) - >>> df.select(pl.col("a").rank()) - shape: (5, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 3.0 │ - │ 4.5 │ - │ 1.5 │ - │ 1.5 │ - │ 4.5 │ - └─────┘ - - The \'ordinal\' method: - - >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) - >>> df.select(pl.col("a").rank("ordinal")) - shape: (5, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 3 │ - │ 4 │ - │ 1 │ - │ 2 │ - │ 5 │ - └─────┘ - - Use \'rank\' with \'over\' to rank within groups: - - >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) - >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) - shape: (5, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ rank │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ f64 │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ 1.0 │ - │ 1 ┆ 7 ┆ 2.0 │ - │ 2 ┆ 5 ┆ 1.0 │ - │ 2 ┆ 14 ┆ 3.0 │ - │ 2 ┆ 11 ┆ 2.0 │ - └─────┴─────┴──────┘ - - ''' - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: - ''' - Calculate the first discrete difference between shifted items. - - Parameters - ---------- - n - Number of slots to shift. - null_behavior : {\'ignore\', \'drop\'} - How to handle null values. - - Examples - -------- - >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) - >>> df.with_columns(change=pl.col("int").diff()) - shape: (5, 2) - ┌─────┬────────┐ - │ int ┆ change │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪════════╡ - │ 20 ┆ null │ - │ 10 ┆ -10 │ - │ 30 ┆ 20 │ - │ 25 ┆ -5 │ - │ 35 ┆ 10 │ - └─────┴────────┘ - - >>> df.with_columns(change=pl.col("int").diff(n=2)) - shape: (5, 2) - ┌─────┬────────┐ - │ int ┆ change │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪════════╡ - │ 20 ┆ null │ - │ 10 ┆ null │ - │ 30 ┆ 10 │ - │ 25 ┆ 15 │ - │ 35 ┆ 5 │ - └─────┴────────┘ - - >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) - shape: (3, 1) - ┌──────┐ - │ diff │ - │ --- │ - │ i64 │ - ╞══════╡ - │ 10 │ - │ 15 │ - │ 5 │ - └──────┘ - - ''' - def pct_change(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Computes percentage change between values. 
- - Percentage change (as fraction) between current element and most-recent - non-null element at least `n` period(s) before the current element. - - Computes the change from the previous row by default. - - Parameters - ---------- - n - periods to shift for forming percent change. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [10, 11, 12, None, 12], - ... } - ... ) - >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) - shape: (5, 2) - ┌──────┬────────────┐ - │ a ┆ pct_change │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞══════╪════════════╡ - │ 10 ┆ null │ - │ 11 ┆ 0.1 │ - │ 12 ┆ 0.090909 │ - │ null ┆ 0.0 │ - │ 12 ┆ 0.0 │ - └──────┴────────────┘ - - ''' - def skew(self) -> Self: - ''' - Compute the sample skewness of a data set. - - For normally distributed data, the skewness should be about zero. For - unimodal continuous distributions, a skewness value greater than zero means - that there is more weight in the right tail of the distribution. The - function `skewtest` can be used to determine if the skewness value - is close enough to zero, statistically speaking. - - - See scipy.stats for more information. - - Parameters - ---------- - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Notes - ----- - The sample skewness is computed as the Fisher-Pearson coefficient - of skewness, i.e. - - .. math:: g_1=\\frac{m_3}{m_2^{3/2}} - - where - - .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i - - is the biased sample :math:`i\\texttt{th}` central moment, and - :math:`\\bar{x}` is - the sample mean. If `bias` is False, the calculations are - corrected for bias and the value computed is the adjusted - Fisher-Pearson standardized moment coefficient, i.e. - - .. math:: - G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").skew()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.343622 │ - └──────────┘ - - ''' - def kurtosis(self) -> Self: - ''' - Compute the kurtosis (Fisher or Pearson) of a dataset. - - Kurtosis is the fourth central moment divided by the square of the - variance. If Fisher\'s definition is used, then 3.0 is subtracted from - the result to give 0.0 for a normal distribution. - If bias is False then the kurtosis is calculated using k statistics to - eliminate bias coming from biased moment estimators. - - See scipy.stats for more information - - Parameters - ---------- - fisher : bool, optional - If True, Fisher\'s definition is used (normal ==> 0.0). If False, - Pearson\'s definition is used (normal ==> 3.0). - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").kurtosis()) - shape: (1, 1) - ┌───────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -1.153061 │ - └───────────┘ - - ''' - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: - ''' - Set values outside the given boundaries to the boundary value. - - Parameters - ---------- - lower_bound - Lower bound. Accepts expression input. - Non-expression inputs are parsed as literals. - upper_bound - Upper bound. Accepts expression input. - Non-expression inputs are parsed as literals. 
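Because both bounds accept expression input, clipping against per-row bound columns also works; a hedged example (the `lo`/`hi` column names are invented for illustration):

import polars as pl

df = pl.DataFrame({"a": [-5, 3, 12], "lo": [0, 0, 0], "hi": [10, 2, 10]})
df.with_columns(clipped=pl.col("a").clip(pl.col("lo"), pl.col("hi")))
# expected clipped values: 0, 2, 10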
- - See Also - -------- - when - - Notes - ----- - This method only works for numeric and temporal columns. To clip other data - types, consider writing a `when-then-otherwise` expression. See :func:`when`. - - Examples - -------- - Specifying both a lower and upper bound: - - >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) - >>> df.with_columns(clip=pl.col("a").clip(1, 10)) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ clip │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ -50 ┆ 1 │ - │ 5 ┆ 5 │ - │ 50 ┆ 10 │ - │ null ┆ null │ - └──────┴──────┘ - - Specifying only a single bound: - - >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ clip │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ -50 ┆ -50 │ - │ 5 ┆ 5 │ - │ 50 ┆ 10 │ - │ null ┆ null │ - └──────┴──────┘ - - ''' - def lower_bound(self) -> Self: - ''' - Calculate the lower bound. - - Returns a unit Series with the lowest value possible for the dtype of this - expression. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").lower_bound()) - shape: (1, 1) - ┌──────────────────────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════════════════════╡ - │ -9223372036854775808 │ - └──────────────────────┘ - - ''' - def upper_bound(self) -> Self: - ''' - Calculate the upper bound. - - Returns a unit Series with the highest value possible for the dtype of this - expression. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").upper_bound()) - shape: (1, 1) - ┌─────────────────────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════════════════════╡ - │ 9223372036854775807 │ - └─────────────────────┘ - - ''' - def sign(self) -> Self: - ''' - Compute the element-wise indication of the sign. - - The returned values can be -1, 0, or 1: - - * -1 if x < 0. - * 0 if x == 0. - * 1 if x > 0. - - (null values are preserved as-is). - - Examples - -------- - >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) - >>> df.select(pl.col("a").sign()) - shape: (5, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ -1 │ - │ 0 │ - │ 0 │ - │ 1 │ - │ null │ - └──────┘ - - ''' - def sin(self) -> Self: - ''' - Compute the element-wise value for the sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").sin()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def cos(self) -> Self: - ''' - Compute the element-wise value for the cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").cos()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def tan(self) -> Self: - ''' - Compute the element-wise value for the tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").tan().round(2)) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 1.56 │ - └──────┘ - - ''' - def cot(self) -> Self: - ''' - Compute the element-wise value for the cotangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").cot().round(2)) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 0.64 │ - └──────┘ - - ''' - def arcsin(self) -> Self: - ''' - Compute the element-wise value for the inverse sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arcsin()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.570796 │ - └──────────┘ - - ''' - def arccos(self) -> Self: - ''' - Compute the element-wise value for the inverse cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").arccos()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.570796 │ - └──────────┘ - - ''' - def arctan(self) -> Self: - ''' - Compute the element-wise value for the inverse tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arctan()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.785398 │ - └──────────┘ - - ''' - def sinh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").sinh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.175201 │ - └──────────┘ - - ''' - def cosh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").cosh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.543081 │ - └──────────┘ - - ''' - def tanh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").tanh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.761594 │ - └──────────┘ - - ''' - def arcsinh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arcsinh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.881374 │ - └──────────┘ - - ''' - def arccosh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arccosh()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def arctanh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arctanh()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ inf │ - └─────┘ - - ''' - def degrees(self) -> Self: - ''' - Convert from radians to degrees. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> import math - >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) - >>> df.select(pl.col("a").degrees()) - shape: (9, 1) - ┌────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞════════╡ - │ -720.0 │ - │ -540.0 │ - │ -360.0 │ - │ -180.0 │ - │ 0.0 │ - │ 180.0 │ - │ 360.0 │ - │ 540.0 │ - │ 720.0 │ - └────────┘ - ''' - def radians(self) -> Self: - ''' - Convert from degrees to radians. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) - >>> df.select(pl.col("a").radians()) - shape: (9, 1) - ┌────────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞════════════╡ - │ -12.566371 │ - │ -9.424778 │ - │ -6.283185 │ - │ -3.141593 │ - │ 0.0 │ - │ 3.141593 │ - │ 6.283185 │ - │ 9.424778 │ - │ 12.566371 │ - └────────────┘ - ''' - def reshape(self, dimensions: tuple[int, ...]) -> Self: - ''' - Reshape this Expr to a flat Series or a Series of Lists. - - Parameters - ---------- - dimensions - Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that - dimension is inferred. - - Returns - ------- - Expr - If a single dimension is given, results in an expression of the original - data type. - If a multiple dimensions are given, results in an expression of data type - :class:`List` with shape (rows, cols). - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) - >>> df.select(pl.col("foo").reshape((3, 3))) - shape: (3, 1) - ┌───────────┐ - │ foo │ - │ --- │ - │ list[i64] │ - ╞═══════════╡ - │ [1, 2, 3] │ - │ [4, 5, 6] │ - │ [7, 8, 9] │ - └───────────┘ - - See Also - -------- - Expr.list.explode : Explode a list column. - - ''' - def shuffle(self, seed: int | None = ...) -> Self: - ''' - Shuffle the contents of this expression. - - Parameters - ---------- - seed - Seed for the random number generator. If set to None (default), a - random seed is generated each time the shuffle is called. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").shuffle(seed=1)) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - │ 1 │ - │ 3 │ - └─────┘ - - ''' - def sample(self, n: int | IntoExprColumn | None = ...) -> Self: - ''' - Sample from this expression. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - Shuffle the order of sampled data points. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 3 │ - │ 1 │ - │ 1 │ - └─────┘ - - ''' - def ewm_mean(self) -> Self: - ''' - Exponentially-weighted moving average. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. 
math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_mean(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.666667 │ - │ 2.428571 │ - └──────────┘ - - ''' - def ewm_std(self) -> Self: - ''' - Exponentially-weighted moving standard deviation. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. 
- For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_std(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 0.707107 │ - │ 0.963624 │ - └──────────┘ - - ''' - def ewm_var(self) -> Self: - ''' - Exponentially-weighted moving variance. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_var(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 0.5 │ - │ 0.928571 │ - └──────────┘ - - ''' - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: - ''' - Extremely fast method for extending the Series with \'n\' copies of a value. - - Parameters - ---------- - value - A constant literal value (not an expression) with which to extend the - expression result Series; can pass None to extend with nulls. - n - The number of additional values that will be added. 
- - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3]}) - >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) - shape: (5, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 0 │ - │ 1 │ - │ 2 │ - │ 99 │ - │ 99 │ - └────────┘ - - ''' - def value_counts(self) -> Self: - ''' - Count the occurrences of unique values. - - Parameters - ---------- - sort - Sort the output by count in descending order. - If set to `False` (default), the order of the output is random. - parallel - Execute the computation in parallel. - - .. note:: - This option should likely not be enabled in a group by context, - as the computation is already parallelized per group. - - Returns - ------- - Expr - Expression of data type :class:`Struct` with mapping of unique values to - their count. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} - ... ) - >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT - shape: (3, 1) - ┌─────────────┐ - │ color │ - │ --- │ - │ struct[2] │ - ╞═════════════╡ - │ {"red",2} │ - │ {"green",1} │ - │ {"blue",3} │ - └─────────────┘ - - Sort the output by count. - - >>> df.select(pl.col("color").value_counts(sort=True)) - shape: (3, 1) - ┌─────────────┐ - │ color │ - │ --- │ - │ struct[2] │ - ╞═════════════╡ - │ {"blue",3} │ - │ {"red",2} │ - │ {"green",1} │ - └─────────────┘ - - ''' - def unique_counts(self) -> Self: - ''' - Return a count of the unique values in the order of appearance. - - This method differs from `value_counts` in that it does not return the - values, only the counts and might be faster - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "id": ["a", "b", "b", "c", "c", "c"], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("id").unique_counts(), - ... ] - ... ) - shape: (3, 1) - ┌─────┐ - │ id │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def log(self, base: float = ...) -> Self: - ''' - Compute the logarithm to a given base. - - Parameters - ---------- - base - Given base, defaults to `e` - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").log(base=2)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 1.0 │ - │ 1.584963 │ - └──────────┘ - - ''' - def log1p(self) -> Self: - ''' - Compute the natural logarithm of each element plus one. - - This computes `log(1 + x)` but is more numerically stable for `x` close to zero. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").log1p()) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.693147 │ - │ 1.098612 │ - │ 1.386294 │ - └──────────┘ - - ''' - def entropy(self, base: float = ...) -> Self: - ''' - Computes the entropy. - - Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. - - Parameters - ---------- - base - Given base, defaults to `e` - normalize - Normalize pk if it doesn\'t sum to 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").entropy(base=2)) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.459148 │ - └──────────┘ - >>> df.select(pl.col("a").entropy(base=2, normalize=False)) - shape: (1, 1) - ┌───────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -6.754888 │ - └───────────┘ - - ''' - def cumulative_eval(self, expr: Expr, min_periods: int = ...) 
-> Self: - ''' - Run an expression over a sliding window that increases `1` slot every iteration. - - Parameters - ---------- - expr - Expression to evaluate - min_periods - Number of valid values there should be in the window before the expression - is evaluated. valid values = `length - null_count` - parallel - Run in parallel. Don\'t do this in a group by or another operation that - already has much parallelization. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - This can be really slow as it can have `O(n^2)` complexity. Don\'t use this - for operations that visit all elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) - >>> df.select( - ... [ - ... pl.col("values").cumulative_eval( - ... pl.element().first() - pl.element().last() ** 2 - ... ) - ... ] - ... ) - shape: (5, 1) - ┌────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞════════╡ - │ 0.0 │ - │ -3.0 │ - │ -8.0 │ - │ -15.0 │ - │ -24.0 │ - └────────┘ - - ''' - def set_sorted(self) -> Self: - ''' - Flags the expression as \'sorted\'. - - Enables downstream code to user fast paths for sorted arrays. - - Parameters - ---------- - descending - Whether the `Series` order is descending. - - Warnings - -------- - This can lead to incorrect results if this `Series` is not sorted!! - Use with care! - - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3]}) - >>> df.select(pl.col("values").set_sorted().max()) - shape: (1, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 3 │ - └────────┘ - - ''' - def shrink_dtype(self) -> Self: - ''' - Shrink numeric columns to the minimal required datatype. - - Shrink to the dtype needed to fit the extrema of this [`Series`]. - This can be used to reduce memory pressure. - - Examples - -------- - >>> pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [1, 2, 2 << 32], - ... "c": [-1, 2, 1 << 30], - ... "d": [-112, 2, 112], - ... "e": [-112, 2, 129], - ... "f": ["a", "b", "c"], - ... "g": [0.1, 1.32, 0.12], - ... "h": [True, None, False], - ... } - ... ).select(pl.all().shrink_dtype()) - shape: (3, 8) - ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ - ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ - │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ - │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ - │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ - └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ - - ''' - def cache(self) -> Self: - """ - Cache this expression so that it only is executed once per context. - - .. deprecated:: 0.18.9 - This method now does nothing. It has been superseded by the - `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically - caches expressions that are equal. - - """ - def replace(self, mapping: dict[Any, Any]) -> Self: - ''' - Replace values according to the given mapping. - - Needs a global string cache for lazily evaluated queries on columns of - type `Categorical`. - - Parameters - ---------- - mapping - Mapping of values to their replacement. - default - Value to use when the mapping does not contain the lookup value. - Defaults to keeping the original value. Accepts expression input. - Non-expression inputs are parsed as literals. - return_dtype - Set return dtype to override automatic return dtype determination. 
- - See Also - -------- - str.replace - - Examples - -------- - Replace a single value by another value. Values not in the mapping remain - unchanged. - - >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) - >>> df.with_columns(pl.col("a").replace({2: 100}).alias("replaced")) - shape: (4, 2) - ┌─────┬──────────┐ - │ a ┆ replaced │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════════╡ - │ 1 ┆ 1 │ - │ 2 ┆ 100 │ - │ 2 ┆ 100 │ - │ 3 ┆ 3 │ - └─────┴──────────┘ - - Replace multiple values. Specify a default to set values not in the given map - to the default value. - - >>> df = pl.DataFrame({"country_code": ["FR", "ES", "DE", None]}) - >>> country_code_map = { - ... "CA": "Canada", - ... "DE": "Germany", - ... "FR": "France", - ... None: "unspecified", - ... } - >>> df.with_columns( - ... pl.col("country_code") - ... .replace(country_code_map, default=None) - ... .alias("replaced") - ... ) - shape: (4, 2) - ┌──────────────┬─────────────┐ - │ country_code ┆ replaced │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞══════════════╪═════════════╡ - │ FR ┆ France │ - │ ES ┆ null │ - │ DE ┆ Germany │ - │ null ┆ unspecified │ - └──────────────┴─────────────┘ - - The return type can be overridden with the `return_dtype` argument. - - >>> df = df.with_row_count() - >>> df.select( - ... "row_nr", - ... pl.col("row_nr") - ... .replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) - ... .alias("replaced"), - ... ) - shape: (4, 2) - ┌────────┬──────────┐ - │ row_nr ┆ replaced │ - │ --- ┆ --- │ - │ u32 ┆ u8 │ - ╞════════╪══════════╡ - │ 0 ┆ 0 │ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 0 │ - └────────┴──────────┘ - - To reference other columns as a `default` value, a struct column must be - constructed first. The first field must be the column in which values are - replaced. The other columns can be used in the default expression. - - >>> df.with_columns( - ... pl.struct("country_code", "row_nr") - ... .replace( - ... mapping=country_code_map, - ... default=pl.col("row_nr").cast(pl.Utf8), - ... ) - ... .alias("replaced") - ... ) - shape: (4, 3) - ┌────────┬──────────────┬─────────────┐ - │ row_nr ┆ country_code ┆ replaced │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ str ┆ str │ - ╞════════╪══════════════╪═════════════╡ - │ 0 ┆ FR ┆ France │ - │ 1 ┆ ES ┆ 1 │ - │ 2 ┆ DE ┆ Germany │ - │ 3 ┆ null ┆ unspecified │ - └────────┴──────────────┴─────────────┘ - ''' - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom python function to a Series or sequence of Series. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.map_batches`. - - Parameters - ---------- - function - Lambda/ function to apply. - return_dtype - Dtype of the output Series. - agg_list - Aggregate list - - """ - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.map_elements`. - - Parameters - ---------- - function - Lambda/ function to apply. - return_dtype - Dtype of the output Series. - If not set, the dtype will be - `polars.Unknown`. - skip_nulls - Don't apply the function over values - that contain nulls. This is faster. - pass_name - Pass the Series name to the custom function - This is more expensive. - strategy : {'thread_local', 'threading'} - This functionality is in `alpha` stage. 
This may be removed - /changed without it being considered a breaking change. - - - 'thread_local': run the python function on a single thread. - - 'threading': run the python function on separate threads. Use with - care as this can slow performance. This might only speed up - your code if the amount of work per element is significant - and the python function releases the GIL (e.g. via calling - a c function) - - """ - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - """ - Apply a custom rolling window function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.rolling_map`. - - Parameters - ---------- - function - Aggregation function - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - """ - def is_first(self) -> Self: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Expr.is_first_distinct`. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - """ - def is_last(self) -> Self: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Expr.is_last_distinct`. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - """ - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: - """ - Clip (limit) the values in an array to a `min` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - lower_bound - Lower bound. - - """ - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: - """ - Clip (limit) the values in an array to a `max` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - upper_bound - Upper bound. - - """ - def shift_and_fill(self, fill_value: IntoExpr) -> Self: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - Fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def register_plugin(self) -> Self: - """ - Register a shared library as a plugin. - - .. warning:: - This is highly unsafe as this will call the C function - loaded by `lib::symbol`. - - The parameters you give dictate how polars will deal - with the function. Make sure they are correct! - - .. note:: - This functionality is unstable and may change without it - being considered breaking. - - Parameters - ---------- - lib - Library to load. - symbol - Function to load. - args - Arguments (other than self) passed to this function. - These arguments have to be of type Expression. - kwargs - Non-expression arguments. They must be JSON serializable. - is_elementwise - If the function only operates on scalars - this will trigger fast paths. - input_wildcard_expansion - Expand expressions as input of this function. 
- returns_scalar - Automatically explode on unit length if it ran as final aggregation. - this is the case for aggregations like `sum`, `min`, `covariance` etc. - cast_to_supertypes - Cast the input datatypes to their supertype. - pass_name_to_apply - if set, then the `Series` passed to the function in the group_by operation - will ensure the name is set. This is an extra heap allocation per group. - changes_length - For example a `unique` or a `slice` - - """ - def _register_plugin(self) -> Self: ... - def take_every(self, n: int) -> Self: - """ - Take every nth value in the Series and return as a new Series. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: - """ - Take values by index. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather`. - - Parameters - ---------- - indices - An expression that leads to a UInt32 dtyped Series. - """ - def cumsum(self) -> Self: - """ - Get an array with the cumulative sum computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_sum`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cumprod(self) -> Self: - """ - Get an array with the cumulative product computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_prod`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cummin(self) -> Self: - """ - Get an array with the cumulative min computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_min`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cummax(self) -> Self: - """ - Get an array with the cumulative max computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_max`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cumcount(self) -> Self: - """ - Get an array with the cumulative count computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_count`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def map_dict(self, mapping: dict[Any, Any]) -> Self: - """ - Replace values in column according to remapping dictionary. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`replace`. The default behavior - has changed to keep any values not present in the mapping unchanged. - Pass `default=None` to keep existing behavior. - - Parameters - ---------- - mapping - Dictionary containing the before/after values to map. - default - Value to use when the remapping dict does not contain the lookup value. - Accepts expression input. Non-expression inputs are parsed as literals. - Use `pl.first()`, to keep the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - - """ - @property - def bin(self): ... - @property - def cat(self): ... - @property - def dt(self): ... - @property - def list(self): ... - @property - def arr(self): ... - @property - def meta(self): ... - @property - def name(self): ... - @property - def str(self): ... - @property - def struct(self): ... -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: - """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/expr/expr.pyi new file mode 100644 index 0000000..6ee8e69 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/expr/expr.pyi @@ -0,0 +1,8388 @@ +#: version 0.20.0 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import UInt32 as UInt32 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import _warn_null_comparison as _warn_null_comparison, no_default as no_default, sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... 
+ def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + .. 
note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.map`. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + keep_name + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.prefix`. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.suffix`. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.keep`. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with `^` and end with `$`. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... 
"""Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Return the number of non-null elements in the column. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + len + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 2 │ + └─────┴─────┘ + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values count towards the total. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + count + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. 
Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cum_sum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_sum().alias("cum_sum"), + ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_sum ┆ cum_sum_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 10 │ + │ 2 ┆ 3 ┆ 9 │ + │ 3 ┆ 6 ┆ 7 │ + │ 4 ┆ 10 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_sum().alias("value_cum_sum"), + ... pl.col("values") + ... 
.cum_sum() + ... .forward_fill() + ... .alias("value_cum_sum_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬───────────────┬──────────────────────────┐ + │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═══════════════╪══════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴───────────────┴──────────────────────────┘ + + ''' + def cum_prod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_prod().alias("cum_prod"), + ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), + ... ) + shape: (4, 3) + ┌─────┬──────────┬──────────────────┐ + │ a ┆ cum_prod ┆ cum_prod_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════╪══════════════════╡ + │ 1 ┆ 1 ┆ 24 │ + │ 2 ┆ 2 ┆ 24 │ + │ 3 ┆ 6 ┆ 12 │ + │ 4 ┆ 24 ┆ 4 │ + └─────┴──────────┴──────────────────┘ + + ''' + def cum_min(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_min().alias("cum_min"), + ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_min ┆ cum_min_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 1 ┆ 3 │ + │ 4 ┆ 1 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + ''' + def cum_max(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_max().alias("cum_max"), + ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_max ┆ cum_max_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 2 ┆ 4 │ + │ 3 ┆ 3 ┆ 4 │ + │ 4 ┆ 4 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_max().alias("cum_max"), + ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬─────────┬────────────────────┐ + │ values ┆ cum_max ┆ cum_max_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════════╪════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴─────────┴────────────────────┘ + + ''' + def cum_count(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. 
+ + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_count().alias("cum_count"), + ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), + ... ) + shape: (4, 3) + ┌─────┬───────────┬───────────────────┐ + │ a ┆ cum_count ┆ cum_count_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ u32 ┆ u32 │ + ╞═════╪═══════════╪═══════════════════╡ + │ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 2 ┆ 1 │ + │ 4 ┆ 3 ┆ 0 │ + └─────┴───────────┴───────────────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def round_sig_figs(self, digits: int) -> Self: + ''' + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) + >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) + shape: (3, 2) + ┌─────────┬────────────────┐ + │ a ┆ round_sig_figs │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════════╪════════════════╡ + │ 0.01234 ┆ 0.012 │ + │ 3.333 ┆ 3.3 │ + │ 1234.0 ┆ 1200.0 │ + └─────────┴────────────────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... 
) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + See Also + -------- + Expr.get : Take a single value + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg( + ... pl.col("value").gather([2, 1]) + ... ) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + ''' + def get(self, index: int | Expr) -> Self: + ''' + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. 
+ + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... 
) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), + ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), + ... 
).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for `map` functions is transforming the values + represented by an expression using a third-party library. + + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Notes + ----- + If you are looking to map a function over a window function or group_by context, + refer to func:`map_elements` instead. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_elements + replace + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type `Callable[[Any], Any]`. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type `Callable[[Series], Any]`. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be `pl.Unknown`. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). 
+ pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using `map_elements` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using `over` is considered a GroupBy context + here, so `map_elements` can be used to map functions over window groups. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using `over` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... 
).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").gather_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator `expr & other & ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator `expr | other | ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other` where `None == None`. + + This differs from default `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator `expr >= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ true │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator `expr > other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator `expr <= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator `expr < other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator `expr != other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... 
} + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr != other` where `None == None`. + + This differs from default `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator `expr + other`. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator `expr // other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator `expr % other`. + + Parameters + ---------- + other + Numeric literal or expression value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator `expr * other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator `expr - other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator `expr / other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator `expr ** exponent`. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator `expr ^ other`. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) + shape: (3, 3) + ┌───────────┬──────────────────┬──────────┐ + │ sets ┆ optional_members ┆ contains │ + │ --- ┆ --- ┆ --- │ + │ list[i64] ┆ i64 ┆ bool │ + ╞═══════════╪══════════════════╪══════════╡ + │ [1, 2, 3] ┆ 1 ┆ true │ + │ [1, 2] ┆ 2 ┆ true │ + │ [9, 10] ┆ 3 ┆ false │ + └───────────┴──────────────────┴──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. 
Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with `lit` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... 
pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... 
window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Self: + ''' + Compute a rolling standard deviation. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... 
+ - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. 
+ + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. 
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: + ''' + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. 
+ + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. 
math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) 
-> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def hist(self, bins: IntoExpr | None = ...) -> Self: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + include_breakpoint + Include a column that indicates the upper breakpoint. + include_category + Include a column that shows the intervals as categories. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 3, 8, 8, 2, 1, 3]}) + >>> df.select(pl.col("a").hist(bins=[1, 2, 3])) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 2 │ + │ 2 │ + └─────┘ + >>> df.select( + ... pl.col("a").hist( + ... bins=[1, 2, 3], include_breakpoint=True, include_category=True + ... ) + ... ) + shape: (4, 1) + ┌───────────────────────┐ + │ a │ + │ --- │ + │ struct[3] │ + ╞═══════════════════════╡ + │ {1.0,"(-inf, 1.0]",2} │ + │ {2.0,"(1.0, 2.0]",1} │ + │ {3.0,"(2.0, 3.0]",2} │ + │ {inf,"(3.0, inf]",2} │ + └───────────────────────┘ + + ''' + def replace(self, old: IntoExpr | Sequence[Any] | Mapping[Any, Any], new: IntoExpr | Sequence[Any] | NoDefault = ...) -> Self: + ''' + Replace values by different values. + + Parameters + ---------- + old + Value or sequence of values to replace. + Accepts expression input. Sequences are parsed as Series, + other non-expression inputs are parsed as literals. + Also accepts a mapping of values to their replacement as syntactic sugar for + `replace(new=Series(mapping.keys()), old=Series(mapping.values()))`. + new + Value or sequence of values to replace by. + Accepts expression input. Sequences are parsed as Series, + other non-expression inputs are parsed as literals. + Length must match the length of `old` or have length 1. + default + Set values that were not replaced to this value. + Defaults to keeping the original value. + Accepts expression input. Non-expression inputs are parsed as literals. + return_dtype + The data type of the resulting expression. If set to `None` (default), + the data type is determined automatically based on the other inputs. + + See Also + -------- + str.replace + + Notes + ----- + The global string cache must be enabled when replacing categorical values. + + Examples + -------- + Replace a single value by another value. Values that were not replaced remain + unchanged. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) + >>> df.with_columns(replaced=pl.col("a").replace(2, 100)) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 3 │ + └─────┴──────────┘ + + Replace multiple values by passing sequences to the `old` and `new` parameters. + + >>> df.with_columns(replaced=pl.col("a").replace([2, 3], [100, 200])) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 200 │ + └─────┴──────────┘ + + Passing a mapping with replacements is also supported as syntactic sugar. + Specify a default to set all values that were not matched. + + >>> mapping = {2: 100, 3: 200} + >>> df.with_columns(replaced=pl.col("a").replace(mapping, default=-1)) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ -1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 200 │ + └─────┴──────────┘ + + Replacing by values of a different data type sets the return type based on + a combination of the `new` data type and either the original data type or the + default data type if it was set. 
+ + >>> df = pl.DataFrame({"a": ["x", "y", "z"]}) + >>> mapping = {"x": 1, "y": 2, "z": 3} + >>> df.with_columns(replaced=pl.col("a").replace(mapping)) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + >>> df.with_columns(replaced=pl.col("a").replace(mapping, default=None)) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + + Set the `return_dtype` parameter to control the resulting data type directly. + + >>> df.with_columns( + ... replaced=pl.col("a").replace(mapping, return_dtype=pl.UInt8) + ... ) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ u8 │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + + Expression input is supported for all parameters. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3], "b": [1.5, 2.5, 5.0, 1.0]}) + >>> df.with_columns( + ... replaced=pl.col("a").replace( + ... old=pl.col("a").max(), + ... new=pl.col("b").sum(), + ... default=pl.col("b"), + ... ) + ... ) + shape: (4, 3) + ┌─────┬─────┬──────────┐ + │ a ┆ b ┆ replaced │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═════╪══════════╡ + │ 1 ┆ 1.5 ┆ 1.5 │ + │ 2 ┆ 2.5 ┆ 2.5 │ + │ 2 ┆ 5.0 ┆ 5.0 │ + │ 3 ┆ 1.0 ┆ 10.0 │ + └─────┴─────┴──────────┘ + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + `polars.Unknown`. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by `lib::symbol`. + + The parameters you give dictate how polars will deal + with the function. Make sure they are correct! + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. + returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like `sum`, `min`, `covariance` etc. + cast_to_supertypes + Cast the input datatypes to their supertype. + pass_name_to_apply + if set, then the `Series` passed to the function in the group_by operation + will ensure the name is set. This is an extra heap allocation per group. + changes_length + For example a `unique` or a `slice` + + """ + def _register_plugin(self) -> Self: ... + def take_every(self, n: int) -> Self: + """ + Take every nth value in the Series and return as a new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. 
+ + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + """ + def cumsum(self) -> Self: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumprod(self) -> Self: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummin(self) -> Self: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummax(self) -> Self: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumcount(self) -> Self: + """ + Get an array with the cumulative count computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_count`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in column according to remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
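The deprecated aliases stubbed above (`cumsum`, `take_every`, `map_dict`, and friends) each name their replacement in the deprecation note. A minimal sketch of the renamed equivalents, assuming a polars release recent enough to have them (0.19.16 or later, per the notes above); the example data and column names are illustrative only:

    import polars as pl

    df = pl.DataFrame({"a": [1, 2, 2, 3]})

    # cum_sum() replaces the deprecated cumsum(); replace() replaces map_dict()
    out = df.with_columns(
        pl.col("a").cum_sum().alias("cum_sum"),
        pl.col("a").replace({2: 100}).alias("replaced"),
    )

    # gather_every() replaces take_every(); done in a separate select since it
    # returns fewer rows than the source column
    every_2nd = df.select(pl.col("a").gather_every(2))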
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/lazyframe/frame deleted file mode 100644 index 561f5b2..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/lazyframe/frame +++ /dev/null @@ -1,4211 +0,0 @@ -import P -import np -import pa -from builtins import PyLazyFrame -from pathlib import Path -from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 -from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat -from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy -from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath -from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence - -TYPE_CHECKING: bool -DTYPE_TEMPORAL_UNITS: frozenset -N_INFER_DEFAULT: int - -class LazyFrame: - _accessors: _ClassVar[set] = ... - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - @classmethod - def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: - """ - Lazily read from a CSV file or multiple files via glob patterns. - - Use `pl.scan_csv` to dispatch to this method. - - See Also - -------- - polars.io.scan_csv - - """ - @classmethod - def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: - """ - Lazily read from a parquet file or multiple files via glob patterns. - - Use `pl.scan_parquet` to dispatch to this method. 
- - See Also - -------- - polars.io.scan_parquet - - """ - @classmethod - def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: - """ - Lazily read from an Arrow IPC (Feather v2) file. - - Use `pl.scan_ipc` to dispatch to this method. - - See Also - -------- - polars.io.scan_ipc - - """ - @classmethod - def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: - """ - Lazily read from a newline delimited JSON file. - - Use `pl.scan_ndjson` to dispatch to this method. - - See Also - -------- - polars.io.scan_ndjson - - """ - @classmethod - def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: - """ - Read a logical plan from a JSON string to construct a LazyFrame. - - .. deprecated:: 0.18.12 - This method is deprecated. Convert the JSON string to `StringIO` - and then use `LazyFrame.deserialize`. - - Parameters - ---------- - json - String in JSON format. - - See Also - -------- - deserialize - - """ - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: - """ - Read a logical plan from a JSON file to construct a LazyFrame. - - .. deprecated:: 0.18.12 - This class method has been renamed to `deserialize`. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - - See Also - -------- - deserialize - - """ - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: - ''' - Read a logical plan from a JSON file to construct a LazyFrame. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - - See Also - -------- - LazyFrame.serialize - - Examples - -------- - >>> import io - >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() - >>> json = lf.serialize() - >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def __dataframe_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. - """ - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... - def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def _repr_html_(self) -> str: ... - def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: - ''' - Serialize the logical plan of this LazyFrame to a file or string in JSON format. - - Parameters - ---------- - file - File path to which the result should be written. If set to `None` - (default), the output is returned as a string instead. 
- - See Also - -------- - LazyFrame.deserialize - - Examples - -------- - Serialize the logical plan into a JSON string. - - >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() - >>> json = lf.serialize() - >>> json - \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' - - The logical plan can later be deserialized back into a LazyFrame. - - >>> import io - >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: - """ - Serialize the logical plan of this LazyFrame to a file or string in JSON format. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`LazyFrame.serialize`. - - Parameters - ---------- - file - File path to which the result should be written. If set to `None` - (default), the output is returned as a string instead. - """ - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the frame as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Examples - -------- - >>> def cast_str_to_int(data, col_name): - ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) - ... - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": ["10", "20", "30", "40"], - ... } - ... ) - >>> lf.pipe(cast_str_to_int, col_name="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 30 │ - │ 4 ┆ 40 │ - └─────┴─────┘ - - >>> lf = pl.LazyFrame( - ... { - ... "b": [1, 2], - ... "a": [3, 4], - ... } - ... ) - >>> lf.collect() - shape: (2, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - └─────┴─────┘ - >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 1 │ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def explain(self) -> str: - ''' - Create a string representation of the query plan. - - Different optimizations can be turned on or off. - - Parameters - ---------- - optimized - Return an optimized query plan. Defaults to `True`. - If this is set to `True` the subsequent - optimization flags control which optimizations - run. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... 
"b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).explain() # doctest: +SKIP - ''' - def show_graph(self) -> str | None: - ''' - Show a plot of the query plan. Note that you should have graphviz installed. - - Parameters - ---------- - optimized - Optimize the query plan. - show - Show the figure. - output_path - Write the figure to disk. - raw_output - Return dot syntax. This cannot be combined with `show` and/or `output_path`. - figsize - Passed to matplotlib if `show` == True. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).show_graph() # doctest: +SKIP - - ''' - def inspect(self, fmt: str = ...) -> Self: - ''' - Inspect a node in the computation graph. - - Print the value that this node in the computation graph evaluates to and passes - on the value. - - Examples - -------- - >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) - >>> ( - ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) - ... .inspect() # print the node before the filter - ... .filter(pl.col("bar") == pl.col("foo")) - ... ) # doctest: +ELLIPSIS - - - ''' - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: - ''' - Sort the DataFrame by the given columns. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. - *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, None], - ... "b": [6.0, 5.0, 4.0], - ... "c": ["a", "c", "b"], - ... } - ... ) - >>> lf.sort("a").collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - Sorting by expressions is also supported. - - >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - └──────┴─────┴─────┘ - - Sort by multiple columns by passing a list of columns. 
- - >>> lf.sort(["c", "a"], descending=True).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - └──────┴─────┴─────┘ - - Or use positional arguments to sort by multiple columns in the same way. - - >>> lf.sort("c", "a", descending=[False, True]).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - ''' - def top_k(self, k: int) -> Self: - ''' - Return the `k` largest elements. - - If \'descending=True` the smallest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might - be worse since this requires a stable search. - - See Also - -------- - bottom_k - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 largest values in column b. - - >>> lf.top_k(4, by="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ a ┆ 2 │ - │ b ┆ 2 │ - │ b ┆ 1 │ - └─────┴─────┘ - - Get the rows which contain the 4 largest values when sorting on column b and a. - - >>> lf.top_k(4, by=["b", "a"]).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 2 │ - │ c ┆ 1 │ - └─────┴─────┘ - - ''' - def bottom_k(self, k: int) -> Self: - ''' - Return the `k` smallest elements. - - If \'descending=True` the largest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - top_k - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 smallest values in column b. - - >>> lf.bottom_k(4, by="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 1 │ - │ a ┆ 1 │ - │ c ┆ 1 │ - │ a ┆ 2 │ - └─────┴─────┘ - - Get the rows which contain the 4 smallest values when sorting on column a and b. - - >>> lf.bottom_k(4, by=["a", "b"]).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ b ┆ 1 │ - │ b ┆ 2 │ - └─────┴─────┘ - - ''' - def profile(self) -> tuple[DataFrame, DataFrame]: - ''' - Profile a LazyFrame. 
- - This will run the query and return a tuple - containing the materialized DataFrame and a DataFrame that - contains profiling information of each node that is executed. - - The units of the timings are microseconds. - - Parameters - ---------- - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - show_plot - Show a gantt chart of the profiling result - truncate_nodes - Truncate the label lengths in the gantt chart to this number of - characters. - figsize - matplotlib figsize of the profiling plot - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).profile() # doctest: +SKIP - (shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘, - shape: (3, 3) - ┌─────────────────────────┬───────┬──────┐ - │ node ┆ start ┆ end │ - │ --- ┆ --- ┆ --- │ - │ str ┆ u64 ┆ u64 │ - ╞═════════════════════════╪═══════╪══════╡ - │ optimization ┆ 0 ┆ 5 │ - │ group_by_partitioned(a) ┆ 5 ┆ 470 │ - │ sort(a) ┆ 475 ┆ 1964 │ - └─────────────────────────┴───────┴──────┘) - - ''' - def collect(self) -> DataFrame: - ''' - Materialize this LazyFrame into a DataFrame. - - By default, all query optimizations are enabled. Individual optimizations may - be disabled by setting the corresponding parameter to `False`. - - Parameters - ---------- - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - no_optimization - Turn off (certain) optimizations. - streaming - Process the query in batches to handle larger-than-memory data. - If set to `False` (default), the entire query is processed in a single - batch. - - .. warning:: - This functionality is currently in an alpha state. - - .. note:: - Use :func:`explain` to see if Polars can process the query in streaming - mode. - - Returns - ------- - DataFrame - - See Also - -------- - fetch: Run the query on the first `n` rows only for debugging purposes. - explain : Print the query plan that is evaluated with collect. - profile : Collect the LazyFrame and time each node in the computation graph. - polars.collect_all : Collect multiple LazyFrames at the same time. - polars.Config.set_streaming_chunk_size : Set the size of streaming batches. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... 
) - >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - - Collect in streaming mode - - >>> lf.group_by("a").agg(pl.all().sum()).collect( - ... streaming=True - ... ) # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: - ''' - Collect DataFrame asynchronously in thread pool. - - Collects into a DataFrame (like :func:`collect`), but instead of returning - DataFrame directly, they are scheduled to be collected inside thread pool, - while this method returns almost instantly. - - May be useful if you use gevent or asyncio and want to release control to other - greenlets/tasks while LazyFrames are being collected. - - Parameters - ---------- - gevent - Return wrapper to `gevent.event.AsyncResult` instead of Awaitable - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Notes - ----- - In case of error `set_exception` is used on - `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - See Also - -------- - polars.collect_all : Collect multiple LazyFrames at the same time. - polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. - - Returns - ------- - If `gevent=False` (default) then returns awaitable. - - If `gevent=True` then returns wrapper that has - `.get(block=True, timeout=None)` method. - - Examples - -------- - >>> import asyncio - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> async def main(): - ... return await ( - ... lf.group_by("a", maintain_order=True) - ... .agg(pl.all().sum()) - ... .collect_async() - ... ) - ... - >>> asyncio.run(main()) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - ''' - def sink_parquet(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to a Parquet file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. 
- Choose "snappy" for more backwards compatibility guarantees - when you deal with older parquet readers. - compression_level - The level of compression to use. Higher compression means smaller files on - disk. - - - "gzip" : min-level: 0, max-level: 10. - - "brotli" : min-level: 0, max-level: 11. - - "zstd" : min-level: 1, max-level: 22. - statistics - Write statistics to the parquet headers. This requires extra compute. - row_group_size - Size of the row groups in number of rows. - If None (default), the chunks of the `DataFrame` are - used. Writing in smaller chunks may reduce memory pressure and improve - writing speeds. - data_pagesize_limit - Size limit of individual data pages. - If not set defaults to 1024 * 1024 bytes - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_parquet("out.parquet") # doctest: +SKIP - - ''' - def sink_ipc(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to an IPC file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - compression : {\'lz4\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_ipc("out.arrow") # doctest: +SKIP - - ''' - def sink_csv(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to a CSV file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - include_bom - Whether to include UTF-8 BOM in the CSV output. - include_header - Whether to include header in the CSV output. - separator - Separate CSV fields with this symbol. - line_terminator - String used to end each row. - quote_char - Byte to use as quoting character. - batch_size - Number of rows that will be processed per thread. - datetime_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. If no format specified, the default fractional-second - precision is inferred from the maximum timeunit found in the frame\'s - Datetime cols (if any). - date_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. - time_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. 
- float_precision - Number of decimal places to write, applied to both `Float32` and - `Float64` datatypes. - null_value - A string representing null values (defaulting to the empty string). - quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} - Determines the quoting strategy used. - - - necessary (default): This puts quotes around fields only when necessary. - They are necessary when fields contain a quote, - delimiter or record terminator. - Quotes are also necessary when writing an empty record - (which is indistinguishable from a record with one empty field). - This is the default. - - always: This puts quotes around every field. Always. - - never: This never puts quotes around fields, even if that results in - invalid CSV data (e.g.: by not quoting strings containing the - separator). - - non_numeric: This puts quotes around all fields that are non-numeric. - Namely, when writing a field that does not parse as a valid float - or integer, then quotes will be used even if they aren`t strictly - necessary. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_csv("out.csv") # doctest: +SKIP - - ''' - def sink_ndjson(self, path: str | Path) -> DataFrame: - ''' - Persists a LazyFrame at the provided path. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_json("out.json") # doctest: +SKIP - - ''' - def _set_sink_optimizations(self) -> PyLazyFrame: ... - def fetch(self, n_rows: int = ...) -> DataFrame: - ''' - Collect a small number of rows for debugging purposes. - - Parameters - ---------- - n_rows - Collect n_rows from the data sources. - type_coercion - Run type coercion optimization. - predicate_pushdown - Run predicate pushdown optimization. - projection_pushdown - Run projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off optimizations. - slice_pushdown - Slice pushdown optimization - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. 
- streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Notes - ----- - This is similar to a :func:`collect` operation, but it overwrites the number of - rows read by *every* scan operation. Be aware that `fetch` does not guarantee - the final number of rows in the DataFrame. Filters, join operations and fewer - rows being available in the scanned data will all influence the final number - of rows (joins are especially susceptible to this, and may return no data - at all if `n_rows` is too small as the join keys may not be present). - - Warnings - -------- - This is strictly a utility function that can help to debug queries using a - smaller number of rows, and should *not* be used in production code. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 6 │ - │ b ┆ 2 ┆ 5 │ - └─────┴─────┴─────┘ - - ''' - def lazy(self) -> Self: - ''' - Return lazy representation, i.e. itself. - - Useful for writing code that expects either a :class:`DataFrame` or - :class:`LazyFrame`. - - Returns - ------- - LazyFrame - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.lazy() # doctest: +ELLIPSIS - - - ''' - def cache(self) -> Self: - """Cache the result once the execution of the physical plan hits this node.""" - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: - ''' - Cast LazyFrame column(s) to the specified dtype(s). - - Parameters - ---------- - dtypes - Mapping of column names (or selector) to dtypes, or a single dtype - to which all columns will be cast. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> from datetime import date - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], - ... } - ... ) - - Cast specific frame columns to the specified dtypes: - - >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ u8 ┆ date │ - ╞═════╪═════╪════════════╡ - │ 1.0 ┆ 6 ┆ 2020-01-02 │ - │ 2.0 ┆ 7 ┆ 2021-03-04 │ - │ 3.0 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - Cast all frame columns to the specified dtype: - - >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) - {\'foo\': [\'1\', \'2\', \'3\'], - \'bar\': [\'6.0\', \'7.0\', \'8.0\'], - \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} - - Use selectors to define the columns being cast: - - >>> import polars.selectors as cs - >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ str │ - ╞═════╪═════╪════════════╡ - │ 1 ┆ 6 ┆ 2020-01-02 │ - │ 2 ┆ 7 ┆ 2021-03-04 │ - │ 3 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - ''' - def clear(self, n: int = ...) -> LazyFrame: - ''' - Create an empty copy of the current LazyFrame, with zero to \'n\' rows. 
- - Returns a copy with an identical schema but no data. - - Parameters - ---------- - n - Number of (empty) rows to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.clear().fetch() - shape: (0, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪═════╪══════╡ - └─────┴─────┴──────┘ - - >>> lf.clear(2).fetch() - shape: (2, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪══════╪══════╡ - │ null ┆ null ┆ null │ - │ null ┆ null ┆ null │ - └──────┴──────┴──────┘ - - ''' - def clone(self) -> Self: - ''' - Create a copy of this LazyFrame. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current LazyFrame, with identical - schema but no data. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.clone() # doctest: +ELLIPSIS - - - ''' - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: - ''' - Filter the rows in the LazyFrame based on a predicate expression. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - predicates - Expression that evaluates to a boolean Series. - constraints - Column filters. Use name=value to filter column name by the supplied value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - - Filter on one condition: - - >>> lf.filter(pl.col("foo") > 1).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Filter on multiple conditions: - - >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `*args` syntax: - - >>> lf.filter( - ... pl.col("foo") == 1, - ... pl.col("ham") == "a", - ... ).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `**kwargs` syntax: - - >>> lf.filter(foo=1, ham="a").collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Filter on an OR condition: - - >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - ''' - Select columns from this LazyFrame. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. 
Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Examples - -------- - Pass the name of a column to select that column. - - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.select("foo").collect() - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Multiple columns can be selected by passing a list of column names. - - >>> lf.select(["foo", "bar"]).collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - └─────┴─────┘ - - Multiple columns can also be selected using positional arguments instead of a - list. Expressions are also accepted. - - >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - └─────┴─────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> lf.select( - ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) - ... ).collect() - shape: (3, 1) - ┌───────────┐ - │ threshold │ - │ --- │ - │ i32 │ - ╞═══════════╡ - │ 0 │ - │ 0 │ - │ 10 │ - └───────────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... lf.select( - ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), - ... ).collect() - ... - shape: (3, 1) - ┌───────────┐ - │ is_odd │ - │ --- │ - │ struct[2] │ - ╞═══════════╡ - │ {1,0} │ - │ {0,1} │ - │ {1,0} │ - └───────────┘ - - ''' - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - """ - Select columns from this LazyFrame. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - See Also - -------- - select - - """ - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: - ''' - Start a group by operation. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Setting this to `True` blocks the possibility - to run on the streaming engine. - - Examples - -------- - Group by one column and call `agg` to compute the grouped sum of another - column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... 
) - >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 2 │ - │ b ┆ 5 │ - │ c ┆ 3 │ - └─────┴─────┘ - - Set `maintain_order=True` to ensure the order of the groups is consistent with - the input. - - >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() - shape: (3, 2) - ┌─────┬───────────┐ - │ a ┆ c │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [5, 3] │ - │ b ┆ [4, 2] │ - │ c ┆ [1] │ - └─────┴───────────┘ - - Group by multiple columns by passing a list of column names. - - >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - Or use positional arguments to group by multiple columns in the same way. - Expressions are also accepted. - - >>> lf.group_by("a", pl.col("b") // 2).agg( - ... pl.col("c").mean() - ... ).collect() # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ f64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 0 ┆ 4.0 │ - │ b ┆ 1 ┆ 3.0 │ - │ c ┆ 1 ┆ 1.0 │ - └─────┴─────┴─────┘ - - ''' - def rolling(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - Different from a `dynamic_group_by` the windows are now determined by the - individual values and are not of constant intervals. For constant intervals - use :func:`LazyFrame.group_by_dynamic`. - - If you have a time series ``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... - * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... - * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). 
- by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - group_by_dynamic - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> out = ( - ... df.rolling(index_column="dt", period="2d") - ... .agg( - ... pl.sum("a").alias("sum_a"), - ... pl.min("a").alias("min_a"), - ... pl.max("a").alias("max_a"), - ... ) - ... .collect() - ... ) - >>> out - shape: (6, 4) - ┌─────────────────────┬───────┬───────┬───────┐ - │ dt ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴───────┴───────┴───────┘ - - ''' - def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - Time windows are calculated and rows are assigned to windows. Different from a - normal group by is that a row can be member of multiple groups. - By default, the windows look like: - - - [start, start + period) - - [start + every, start + every + period) - - [start + 2*every, start + 2*every + period) - - ... - - where `start` is determined by `start_by`, `offset`, and `every` (see parameter - descriptions below). - - .. warning:: - The index column must be sorted in ascending order. If `by` is passed, then - the index column must be sorted in ascending order within each group. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - - .. deprecated:: 0.19.4 - Use `label` instead. - include_boundaries - Add the lower and upper bound of the window to the "_lower_boundary" and - "_upper_boundary" columns. 
This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - label : {\'left\', \'right\', \'datapoint\'} - Define which label to use for the window: - - - \'left\': lower boundary of the window - - \'right\': upper boundary of the window - - \'datapoint\': the first value of the index column in the given window. - If you don\'t need the label to be at one of the boundaries, choose this - option for maximum performance - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - rolling - - Notes - ----- - 1) If you\'re coming from pandas, then - - .. code-block:: python - - # polars - df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) - - is equivalent to - - .. code-block:: python - - # pandas - df.set_index("ts").resample("D")["value"].sum().reset_index() - - though note that, unlike pandas, polars doesn\'t add extra rows for empty - windows. If you need `index_column` to be evenly spaced, then please combine - with :func:`DataFrame.upsample`. - - 2) The `every`, `period` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a group_by_dynamic on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Examples - -------- - >>> from datetime import datetime - >>> lf = pl.LazyFrame( - ... { - ... "time": pl.datetime_range( - ... start=datetime(2021, 12, 16), - ... end=datetime(2021, 12, 16, 3), - ... interval="30m", - ... eager=True, - ... ), - ... "n": range(7), - ... } - ... 
) - >>> lf.collect() - shape: (7, 2) - ┌─────────────────────┬─────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i64 │ - ╞═════════════════════╪═════╡ - │ 2021-12-16 00:00:00 ┆ 0 │ - │ 2021-12-16 00:30:00 ┆ 1 │ - │ 2021-12-16 01:00:00 ┆ 2 │ - │ 2021-12-16 01:30:00 ┆ 3 │ - │ 2021-12-16 02:00:00 ┆ 4 │ - │ 2021-12-16 02:30:00 ┆ 5 │ - │ 2021-12-16 03:00:00 ┆ 6 │ - └─────────────────────┴─────┘ - - Group by windows of 1 hour starting at 2021-12-16 00:00:00. - - >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( - ... pl.col("n") - ... ).collect() - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [1, 2] │ - │ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ 2021-12-16 02:00:00 ┆ [5, 6] │ - └─────────────────────┴───────────┘ - - The window boundaries can also be added to the aggregation result - - >>> lf.group_by_dynamic( - ... "time", every="1h", include_boundaries=True, closed="right" - ... ).agg(pl.col("n").mean()).collect() - shape: (4, 4) - ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ - │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ - ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ - │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ - │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ - │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ - │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ - └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ - - When closed="left", the window excludes the right end of interval: - [lower_bound, upper_bound) - - >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( - ... pl.col("n") - ... ).collect() - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-16 00:00:00 ┆ [0, 1] │ - │ 2021-12-16 01:00:00 ┆ [2, 3] │ - │ 2021-12-16 02:00:00 ┆ [4, 5] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - When closed="both" the time values at the window boundaries belong to 2 groups. - - >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( - ... pl.col("n") - ... ).collect() - shape: (5, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ - │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - Dynamic group bys can also be combined with grouping on normal keys - - >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) - >>> lf.collect() - shape: (7, 3) - ┌─────────────────────┬─────┬────────┐ - │ time ┆ n ┆ groups │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ str │ - ╞═════════════════════╪═════╪════════╡ - │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ - │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ - │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ - │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ - │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ - │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ - │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ - └─────────────────────┴─────┴────────┘ - >>> lf.group_by_dynamic( - ... "time", - ... 
every="1h", - ... closed="both", - ... by="groups", - ... include_boundaries=True, - ... ).agg(pl.col("n")).collect() - shape: (7, 5) - ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ - │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ - ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ - │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ - │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ - │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ - │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ - │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ - └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ - - Dynamic group by on an index column - - >>> lf = pl.LazyFrame( - ... { - ... "idx": pl.int_range(0, 6, eager=True), - ... "A": ["A", "A", "B", "B", "B", "C"], - ... } - ... ) - >>> lf.group_by_dynamic( - ... "idx", - ... every="2i", - ... period="3i", - ... include_boundaries=True, - ... closed="right", - ... ).agg(pl.col("A").alias("A_agg_list")).collect() - shape: (4, 4) - ┌─────────────────┬─────────────────┬─────┬─────────────────┐ - │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 ┆ list[str] │ - ╞═════════════════╪═════════════════╪═════╪═════════════════╡ - │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ - │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ - │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ - │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ - └─────────────────┴─────────────────┴─────┴─────────────────┘ - - ''' - def join_asof(self, other: LazyFrame) -> Self: - ''' - Perform an asof join. - - This is similar to a left-join except that we match on nearest key rather than - equal keys. - - Both DataFrames must be sorted by the join_asof key. - - For each row in the left DataFrame: - - - A "backward" search selects the last row in the right DataFrame whose - \'on\' key is less than or equal to the left\'s key. - - - A "forward" search selects the first row in the right DataFrame whose - \'on\' key is greater than or equal to the left\'s key. - - A "nearest" search selects the last row in the right DataFrame whose value - is nearest to the left\'s key. String keys are not currently supported for a - nearest search. - - The default is "backward". - - Parameters - ---------- - other - Lazy DataFrame to join with. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - by - Join on these columns before doing asof join. - by_left - Join on these columns before doing asof join. - by_right - Join on these columns before doing asof join. - strategy : {\'backward\', \'forward\', \'nearest\'} - Join strategy. - suffix - Suffix to append to columns with a duplicate name. - tolerance - Numeric tolerance. By setting this the join will only be done if the near - keys are within this distance. 
If an asof join is done on columns of dtype - "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta - object or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. - - Examples - -------- - >>> from datetime import datetime - >>> gdp = pl.LazyFrame( - ... { - ... "date": [ - ... datetime(2016, 1, 1), - ... datetime(2017, 1, 1), - ... datetime(2018, 1, 1), - ... datetime(2019, 1, 1), - ... ], # note record date: Jan 1st (sorted!) - ... "gdp": [4164, 4411, 4566, 4696], - ... } - ... ).set_sorted("date") - >>> population = pl.LazyFrame( - ... { - ... "date": [ - ... datetime(2016, 5, 12), - ... datetime(2017, 5, 12), - ... datetime(2018, 5, 12), - ... datetime(2019, 5, 12), - ... ], # note record date: May 12th (sorted!) - ... "population": [82.19, 82.66, 83.12, 83.52], - ... } - ... ).set_sorted("date") - >>> population.join_asof(gdp, on="date", strategy="backward").collect() - shape: (4, 3) - ┌─────────────────────┬────────────┬──────┐ - │ date ┆ population ┆ gdp │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ f64 ┆ i64 │ - ╞═════════════════════╪════════════╪══════╡ - │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ - │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ - │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ - │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ - └─────────────────────┴────────────┴──────┘ - - ''' - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: - ''' - Add a join operation to the Logical Plan. - - Parameters - ---------- - other - Lazy DataFrame to join with. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} - Join strategy. - - .. note:: - A left join preserves the row order of the left DataFrame. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - suffix - Suffix to append to columns with a duplicate name. - validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} - Checks if join is of specified type. - - * *many_to_many* - “m:m”: default, does not result in checks - * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets - * *one_to_many* - “1:m”: check if join keys are unique in left dataset - * *many_to_one* - “m:1”: check if join keys are unique in right dataset - - .. note:: - - - This is currently not supported the streaming engine. - - This is only supported when joined by single columns. - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. 
- - See Also - -------- - join_asof - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> other_lf = pl.LazyFrame( - ... { - ... "apple": ["x", "y", "z"], - ... "ham": ["a", "b", "d"], - ... } - ... ) - >>> lf.join(other_lf, on="ham").collect() - shape: (2, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - └─────┴─────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="outer").collect() - shape: (4, 4) - ┌──────┬──────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞══════╪══════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ null ┆ null ┆ d ┆ z │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └──────┴──────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="left").collect() - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └─────┴─────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="semi").collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 7.0 ┆ b │ - └─────┴─────┴─────┘ - >>> lf.join(other_lf, on="ham", how="anti").collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - ''' - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - Notes - ----- - Creating a new LazyFrame using this method does not create a new copy of - existing data. - - Examples - -------- - Pass an expression to add it as a new column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ a^2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ - └─────┴──────┴───────┴──────┘ - - Added columns will replace existing columns with the same name. - - >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1.0 ┆ 0.5 ┆ true │ - │ 2.0 ┆ 4.0 ┆ true │ - │ 3.0 ┆ 10.0 ┆ false │ - │ 4.0 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - Multiple columns can be added by passing a list of expressions. - - >>> lf.with_columns( - ... [ - ... 
(pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ] - ... ).collect() - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Multiple columns also can be added using positional arguments instead of a list. - - >>> lf.with_columns( - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ).collect() - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> lf.with_columns( - ... ab=pl.col("a") * pl.col("b"), - ... not_c=pl.col("c").not_(), - ... ).collect() - shape: (4, 5) - ┌─────┬──────┬───────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ ab ┆ not_c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ - └─────┴──────┴───────┴──────┴───────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... lf.drop("c").with_columns( - ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), - ... ).collect() - ... - shape: (4, 3) - ┌─────┬──────┬─────────────┐ - │ a ┆ b ┆ diffs │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ struct[2] │ - ╞═════╪══════╪═════════════╡ - │ 1 ┆ 0.5 ┆ {null,null} │ - │ 2 ┆ 4.0 ┆ {1,3.5} │ - │ 3 ┆ 10.0 ┆ {1,6.0} │ - │ 4 ┆ 13.0 ┆ {1,3.0} │ - └─────┴──────┴─────────────┘ - - ''' - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - """ - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - See Also - -------- - with_columns - - """ - def with_context(self, other: Self | list[Self]) -> Self: - ''' - Add an external context to the computation graph. - - This allows expressions to also access columns from DataFrames - that are not part of this one. - - Parameters - ---------- - other - Lazy DataFrame to join with. 
- - Examples - -------- - >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) - >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) - >>> lf.with_context(lf_other).select( - ... pl.col("b") + pl.col("c").first() - ... ).collect() - shape: (3, 1) - ┌──────┐ - │ b │ - │ --- │ - │ str │ - ╞══════╡ - │ afoo │ - │ cfoo │ - │ null │ - └──────┘ - - Fill nulls with the median from another DataFrame: - - >>> train_lf = pl.LazyFrame( - ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} - ... ) - >>> test_lf = pl.LazyFrame( - ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} - ... ) - >>> test_lf.with_context( - ... train_lf.select(pl.all().name.suffix("_train")) - ... ).select( - ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) - ... ).collect() - shape: (3, 1) - ┌───────────┐ - │ feature_0 │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -1.0 │ - │ 0.0 │ - │ 1.0 │ - └───────────┘ - - ''' - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Remove columns from the DataFrame. - - Parameters - ---------- - columns - Name of the column(s) that should be removed from the DataFrame. - *more_columns - Additional columns to drop, specified as positional arguments. - - Examples - -------- - Drop a single column by passing the name of that column. - - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.drop("ham").collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 6.0 │ - │ 2 ┆ 7.0 │ - │ 3 ┆ 8.0 │ - └─────┴─────┘ - - Drop multiple columns by passing a selector. - - >>> import polars.selectors as cs - >>> lf.drop(cs.numeric()).collect() - shape: (3, 1) - ┌─────┐ - │ ham │ - │ --- │ - │ str │ - ╞═════╡ - │ a │ - │ b │ - │ c │ - └─────┘ - - Use positional arguments to drop multiple columns. - - >>> lf.drop("foo", "ham").collect() - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 6.0 │ - │ 7.0 │ - │ 8.0 │ - └─────┘ - - ''' - def rename(self, mapping: dict[str, str]) -> Self: - ''' - Rename column names. - - Parameters - ---------- - mapping - Key value pairs that map from old name to new name. - - Notes - ----- - If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), - polars will block projection and predicate pushdowns at this node. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.rename({"foo": "apple"}).collect() - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - - ''' - def reverse(self) -> Self: - ''' - Reverse the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "key": ["a", "b", "c"], - ... "val": [1, 2, 3], - ... } - ... ) - >>> lf.reverse().collect() - shape: (3, 2) - ┌─────┬─────┐ - │ key ┆ val │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ c ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 1 │ - └─────┴─────┘ - - ''' - def shift(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. 
- fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... ) - >>> lf.shift().collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ null ┆ null │ - │ 1 ┆ 5 │ - │ 2 ┆ 6 │ - │ 3 ┆ 7 │ - └──────┴──────┘ - - Pass a negative value to shift in the opposite direction instead. - - >>> lf.shift(-2).collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ null ┆ null │ - │ null ┆ null │ - └──────┴──────┘ - - Specify `fill_value` to fill the resulting null values. - - >>> lf.shift(-2, fill_value=100).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ 100 ┆ 100 │ - │ 100 ┆ 100 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int, length: int | None = ...) -> Self: - ''' - Get a slice of this DataFrame. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> lf.slice(1, 2).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ y ┆ 3 ┆ 4 │ - │ z ┆ 5 ┆ 6 │ - └─────┴─────┴─────┘ - - ''' - def limit(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Alias for :func:`LazyFrame.head`. - - Parameters - ---------- - n - Number of rows to return. - - Notes - ----- - Consider using the :func:`fetch` operation if you only want to test your - query. The :func:`fetch` operation will load the first `n` rows at the scan - level, whereas the :func:`head`/:func:`limit` are applied at the end. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... ) - >>> lf.limit().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - └─────┴─────┘ - >>> lf.limit(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - └─────┴─────┘ - - ''' - def head(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Notes - ----- - Consider using the :func:`fetch` operation if you only want to test your - query. The :func:`fetch` operation will load the first `n` rows at the scan - level, whereas the :func:`head`/:func:`limit` are applied at the end. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... 
) - >>> lf.head().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - └─────┴─────┘ - >>> lf.head(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - └─────┴─────┘ - - ''' - def tail(self, n: int = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... ) - >>> lf.tail().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - │ 6 ┆ 12 │ - └─────┴─────┘ - >>> lf.tail(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 5 ┆ 11 │ - │ 6 ┆ 12 │ - └─────┴─────┘ - - ''' - def last(self) -> Self: - ''' - Get the last row of the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.last().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 5 ┆ 6 │ - └─────┴─────┘ - - ''' - def first(self) -> Self: - ''' - Get the first row of the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.first().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_n_unique(self) -> Self: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.approx_n_unique().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_unique(self) -> Self: - """ - Approximate count of unique values. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`LazyFrame.approx_n_unique`. - - """ - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: - ''' - Add a column at index 0 that counts the rows. - - Parameters - ---------- - name - Name of the column to add. - offset - Start the row count at this offset. - - Warnings - -------- - This can have a negative effect on query performance. - This may, for instance, block predicate pushdown optimization. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.with_row_count().collect() - shape: (3, 3) - ┌────────┬─────┬─────┐ - │ row_nr ┆ a ┆ b │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╡ - │ 0 ┆ 1 ┆ 2 │ - │ 1 ┆ 3 ┆ 4 │ - │ 2 ┆ 5 ┆ 6 │ - └────────┴─────┴─────┘ - - ''' - def gather_every(self, n: int) -> Self: - ''' - Take every nth row in the LazyFrame and return as a new LazyFrame. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... 
) - >>> lf.gather_every(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 5 │ - │ 3 ┆ 7 │ - └─────┴─────┘ - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - matches_supertype - Fill all matching supertypes of the fill `value` literal. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, None, 4], - ... "b": [0.5, 4, None, 13], - ... } - ... ) - >>> lf.fill_null(99).collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 99 ┆ 99.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - >>> lf.fill_null(strategy="forward").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> lf.fill_null(strategy="max").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> lf.fill_null(strategy="zero").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 0 ┆ 0.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Self: - ''' - Fill floating point NaN values. - - Parameters - ---------- - value - Value to fill the NaN values with. - - Warnings - -------- - Note that floating point NaN (Not a Number) are not missing values! - To replace missing values, use :func:`fill_null` instead. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1.5, 2, float("nan"), 4], - ... "b": [0.5, 4, float("nan"), 13], - ... } - ... ) - >>> lf.fill_nan(99).collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════╡ - │ 1.5 ┆ 0.5 │ - │ 2.0 ┆ 4.0 │ - │ 99.0 ┆ 99.0 │ - │ 4.0 ┆ 13.0 │ - └──────┴──────┘ - - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their standard deviation value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.std().collect() - shape: (1, 2) - ┌──────────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪═════╡ - │ 1.290994 ┆ 0.5 │ - └──────────┴─────┘ - >>> lf.std(ddof=0).collect() - shape: (1, 2) - ┌──────────┬──────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪══════════╡ - │ 1.118034 ┆ 0.433013 │ - └──────────┴──────────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their variance value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. 
- - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.var().collect() - shape: (1, 2) - ┌──────────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪══════╡ - │ 1.666667 ┆ 0.25 │ - └──────────┴──────┘ - >>> lf.var(ddof=0).collect() - shape: (1, 2) - ┌──────┬────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪════════╡ - │ 1.25 ┆ 0.1875 │ - └──────┴────────┘ - - ''' - def max(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their maximum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.max().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def min(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their minimum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.min().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 1 │ - └─────┴─────┘ - - ''' - def sum(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their sum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.sum().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 10 ┆ 5 │ - └─────┴─────┘ - - ''' - def mean(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their mean value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.mean().collect() - shape: (1, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════╡ - │ 2.5 ┆ 1.25 │ - └─────┴──────┘ - - ''' - def median(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their median value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.median().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 2.5 ┆ 1.0 │ - └─────┴─────┘ - - ''' - def null_count(self) -> Self: - ''' - Aggregate the columns in the LazyFrame as the sum of their null value count. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, None, 3], - ... "bar": [6, 7, None], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.null_count().collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 1 ┆ 0 │ - └─────┴─────┴─────┘ - - ''' - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... 
) - >>> lf.quantile(0.7).collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 3.0 ┆ 1.0 │ - └─────┴─────┘ - - ''' - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: - ''' - Explode the DataFrame to long format by exploding the given columns. - - Parameters - ---------- - columns - Column names, expressions, or a selector defining them. The underlying - columns being exploded must be of List or Utf8 datatype. - *more_columns - Additional names of columns to explode, specified as positional arguments. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "letters": ["a", "a", "b", "c"], - ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], - ... } - ... ) - >>> lf.explode("numbers").collect() - shape: (8, 2) - ┌─────────┬─────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════════╪═════════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ a ┆ 3 │ - │ b ┆ 4 │ - │ b ┆ 5 │ - │ c ┆ 6 │ - │ c ┆ 7 │ - │ c ┆ 8 │ - └─────────┴─────────┘ - - ''' - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Drop duplicate rows from this DataFrame. - - Parameters - ---------- - subset - Column name(s) or selector(s), to consider when identifying - duplicate rows. If set to `None` (default), use all columns. - keep : {\'first\', \'last\', \'any\', \'none\'} - Which of the duplicate rows to keep. - - * \'any\': Does not give any guarantee of which row is kept. - This allows more optimizations. - * \'none\': Don\'t keep duplicate rows. - * \'first\': Keep first unique row. - * \'last\': Keep last unique row. - maintain_order - Keep the same order as the original DataFrame. This is more expensive to - compute. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - Returns - ------- - LazyFrame - LazyFrame with unique rows. - - Warnings - -------- - This method will fail if there is a column of type `List` in the DataFrame or - subset. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3, 1], - ... "bar": ["a", "a", "a", "a"], - ... "ham": ["b", "b", "b", "b"], - ... } - ... ) - >>> lf.unique(maintain_order=True).collect() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> lf.unique(keep="last", maintain_order=True).collect() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - - ''' - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Drop all rows that contain null values. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - subset - Column name(s) for which null values are considered. - If set to `None` (default), use all columns. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, None, 8], - ... "ham": ["a", "b", None], - ... } - ... 
) - - The default behavior of this method is to drop rows where any single - value of the row is null. - - >>> lf.drop_nulls().collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - This behaviour can be constrained to consider only a subset of columns, as - defined by name or with a selector. For example, dropping rows if there is - a null in any of the integer columns: - - >>> import polars.selectors as cs - >>> lf.drop_nulls(subset=cs.integer()).collect() - shape: (2, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ null │ - └─────┴─────┴──────┘ - - This method drops a row if any single value of the row is null. - - Below are some example snippets that show how you could drop null - values based on other conditions: - - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, None, None, None], - ... "b": [1, 2, None, 1], - ... "c": [1, None, None, 1], - ... } - ... ) - >>> lf.collect() - shape: (4, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪══════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ null ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴──────┴──────┘ - - Drop a row only if all values are null: - - >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴─────┴──────┘ - - ''' - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: - ''' - Unpivot a DataFrame from wide to long format. - - Optionally leaves identifiers set. - - This function is useful to massage a DataFrame into a format where one or more - columns are identifier variables (id_vars) while all other columns, considered - measured variables (value_vars), are "unpivoted" to the row axis leaving just - two non-identifier columns, \'variable\' and \'value\'. - - Parameters - ---------- - id_vars - Column(s) or selector(s) to use as identifier variables. - value_vars - Column(s) or selector(s) to use as values variables; if `value_vars` - is empty all columns that are not in `id_vars` will be used. - variable_name - Name to give to the `variable` column. Defaults to "variable" - value_name - Name to give to the `value` column. Defaults to "value" - streamable - Allow this node to run in the streaming engine. - If this runs in streaming, the output of the melt operation - will not have a stable ordering. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> import polars.selectors as cs - >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() - shape: (6, 3) - ┌─────┬──────────┬───────┐ - │ a ┆ variable ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 │ - ╞═════╪══════════╪═══════╡ - │ x ┆ b ┆ 1 │ - │ y ┆ b ┆ 3 │ - │ z ┆ b ┆ 5 │ - │ x ┆ c ┆ 2 │ - │ y ┆ c ┆ 4 │ - │ z ┆ c ┆ 6 │ - └─────┴──────────┴───────┘ - - ''' - def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: - ''' - Apply a custom function. 
- - It is important that the function returns a Polars DataFrame. - - Parameters - ---------- - function - Lambda/ function to apply. - predicate_pushdown - Allow predicate pushdown optimization to pass this node. - projection_pushdown - Allow projection pushdown optimization to pass this node. - slice_pushdown - Allow slice pushdown optimization to pass this node. - no_optimizations - Turn off all optimizations past this point. - schema - Output schema of the function, if set to `None` we assume that the schema - will remain unchanged by the applied function. - validate_output_schema - It is paramount that polars\' schema is correct. This flag will ensure that - the output schema of this function will be checked with the expected schema. - Setting this to `False` will not do this check, but may lead to hard to - debug bugs. - streamable - Whether the function that is given is eligible to be running with the - streaming engine. That means that the function must produce the same result - when it is executed in batches or when it is be executed on the full - dataset. - - Warnings - -------- - The `schema` of a `LazyFrame` must always be correct. It is up to the caller - of this function to ensure that this invariant is upheld. - - It is important that the optimization flags are correct. If the custom function - for instance does an aggregation of a column, `predicate_pushdown` should not - be allowed, as this prunes rows and will influence your aggregation results. - - Examples - -------- - >>> lf = ( # doctest: +SKIP - ... pl.LazyFrame( - ... { - ... "a": pl.int_range(-100_000, 0, eager=True), - ... "b": pl.int_range(0, 100_000, eager=True), - ... } - ... ) - ... .map_batches(lambda x: 2 * x, streamable=True) - ... .collect(streaming=True) - ... ) - shape: (100_000, 2) - ┌─────────┬────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════════╪════════╡ - │ -200000 ┆ 0 │ - │ -199998 ┆ 2 │ - │ -199996 ┆ 4 │ - │ -199994 ┆ 6 │ - │ … ┆ … │ - │ -8 ┆ 199992 │ - │ -6 ┆ 199994 │ - │ -4 ┆ 199996 │ - │ -2 ┆ 199998 │ - └─────────┴────────┘ - - ''' - def interpolate(self) -> Self: - ''' - Interpolate intermediate values. The interpolation method is linear. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, None, 9, 10], - ... "bar": [6, 7, 9, None], - ... "baz": [1, None, None, 9], - ... } - ... ) - >>> lf.interpolate().collect() - shape: (4, 3) - ┌──────┬──────┬──────────┐ - │ foo ┆ bar ┆ baz │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════════╡ - │ 1.0 ┆ 6.0 ┆ 1.0 │ - │ 5.0 ┆ 7.0 ┆ 3.666667 │ - │ 9.0 ┆ 9.0 ┆ 6.333333 │ - │ 10.0 ┆ null ┆ 9.0 │ - └──────┴──────┴──────────┘ - - ''' - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Decompose struct columns into separate columns for each of their fields. - - The new columns will be inserted into the DataFrame at the location of the - struct column. - - Parameters - ---------- - columns - Name of the struct column(s) that should be unnested. - *more_columns - Additional columns to unnest, specified as positional arguments. - - Examples - -------- - >>> df = pl.LazyFrame( - ... { - ... "before": ["foo", "bar"], - ... "t_a": [1, 2], - ... "t_b": ["a", "b"], - ... "t_c": [True, None], - ... "t_d": [[1, 2], [3]], - ... "after": ["baz", "womp"], - ... } - ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") - >>> df.collect() - shape: (2, 3) - ┌────────┬─────────────────────┬───────┐ - │ before ┆ t_struct ┆ after │ - │ --- ┆ --- ┆ --- │ - │ str ┆ struct[4] ┆ str │ - ╞════════╪═════════════════════╪═══════╡ - │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ - │ bar ┆ {2,"b",null,[3]} ┆ womp │ - └────────┴─────────────────────┴───────┘ - >>> df.unnest("t_struct").collect() - shape: (2, 6) - ┌────────┬─────┬─────┬──────┬───────────┬───────┐ - │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ - ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ - │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ - │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ - └────────┴─────┴─────┴──────┴───────────┴───────┘ - - ''' - def merge_sorted(self, other: LazyFrame, key: str) -> Self: - ''' - Take two sorted DataFrames and merge them by the sorted key. - - The output of this operation will also be sorted. - It is the callers responsibility that the frames are sorted - by that key otherwise the output will not make sense. - - The schemas of both LazyFrames must be equal. - - Parameters - ---------- - other - Other DataFrame that must be merged - key - Key that is sorted. - - Examples - -------- - >>> df0 = pl.LazyFrame( - ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} - ... ).sort("age") - >>> df0.collect() - shape: (3, 2) - ┌───────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═════╡ - │ bob ┆ 18 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └───────┴─────┘ - >>> df1 = pl.LazyFrame( - ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} - ... ).sort("age") - >>> df1.collect() - shape: (4, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - └────────┴─────┘ - >>> df0.merge_sorted(df1, key="age").collect() - shape: (7, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ bob ┆ 18 │ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └────────┴─────┘ - ''' - def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: - """ - Indicate that one or multiple columns are sorted. - - Parameters - ---------- - column - Columns that are sorted - more_columns - Additional columns that are sorted, specified as positional arguments. - descending - Whether the columns are sorted in descending order. - """ - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: - ''' - Update the values in this `LazyFrame` with the non-null values in `other`. - - Parameters - ---------- - other - LazyFrame that will be used to update the values - on - Column names that will be joined on; if given `None` the implicit row - index is used as a join key instead. - left_on - Join column(s) of the left DataFrame. - right_on - Join column(s) of the right DataFrame. - how : {\'left\', \'inner\', \'outer\'} - * \'left\' will keep all rows from the left table; rows may be duplicated - if multiple rows in the right frame match the left row\'s key. - * \'inner\' keeps only those rows where the key exists in both frames. 
- * \'outer\' will update existing rows where the key matches while also - adding any new rows contained in the given frame. - include_nulls - If True, null values from the right DataFrame will be used to update the - left DataFrame. - - Notes - ----- - This is syntactic sugar for a left/inner join, with an optional coalesce when - `include_nulls = False`. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "A": [1, 2, 3, 4], - ... "B": [400, 500, 600, 700], - ... } - ... ) - >>> lf.collect() - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 400 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - >>> new_lf = pl.LazyFrame( - ... { - ... "B": [-66, None, -99], - ... "C": [5, 3, 1], - ... } - ... ) - - Update `df` values with the non-null values in `new_df`, by row index: - - >>> lf.update(new_lf).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, by row index, - but only keeping those rows that are common to both frames: - - >>> lf.update(new_lf, how="inner").collect() - shape: (3, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() - shape: (5, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴─────┘ - - Update `df` values including null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> lf.update( - ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True - ... ).collect() - shape: (5, 2) - ┌─────┬──────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ null │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴──────┘ - - ''' - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: - """ - Start a group by operation. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.group_by`. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - """ - def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. 
- period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - """ - def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.9 - This method has been renamed to :func:`LazyFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - """ - def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.group_by_dynamic`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - include_boundaries - Add the lower and upper bound of the window to the "_lower_bound" and - "_upper_bound" columns. 
This will impact performance because it\'s harder to - parallelize - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - ''' - def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: - """ - Apply a custom function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.map_batches`. - - Parameters - ---------- - function - Lambda/ function to apply. - predicate_pushdown - Allow predicate pushdown optimization to pass this node. - projection_pushdown - Allow projection pushdown optimization to pass this node. - slice_pushdown - Allow slice pushdown optimization to pass this node. - no_optimizations - Turn off all optimizations past this point. - schema - Output schema of the function, if set to `None` we assume that the schema - will remain unchanged by the applied function. - validate_output_schema - It is paramount that polars' schema is correct. This flag will ensure that - the output schema of this function will be checked with the expected schema. - Setting this to `False` will not do this check, but may lead to hard to - debug bugs. - streamable - Whether the function that is given is eligible to be running with the - streaming engine. That means that the function must produce the same result - when it is executed in batches or when it is be executed on the full - dataset. - - """ - def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def take_every(self, n: int) -> Self: - """ - Take every nth row in the LazyFrame and return as a new LazyFrame. - - .. deprecated:: 0.19.0 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - @property - def columns(self): ... - @property - def dtypes(self): ... - @property - def schema(self): ... - @property - def width(self): ... 
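The removed stub above ends with several deprecated LazyFrame methods whose docstrings name their replacements (groupby -> group_by, groupby_dynamic -> group_by_dynamic, take_every -> gather_every, shift_and_fill -> shift, map -> map_batches). A minimal usage sketch of those renamed methods follows, assuming a polars release (>= 0.19.12) where the new names exist; it is illustrative only and not part of the generated stub.

import polars as pl

lf = pl.LazyFrame({"a": ["x", "x", "y"], "b": [1, 2, 3]})

# groupby was renamed to group_by (0.19.0)
grouped = lf.group_by("a").agg(pl.col("b").sum()).collect()

# take_every was renamed to gather_every (0.19.0)
every_other = lf.gather_every(2).collect()

# shift_and_fill is deprecated (0.19.12); use shift with fill_value instead
shifted = lf.shift(1, fill_value=0).collect()

# map was renamed to map_batches (0.19.0); the callable must return a DataFrame
doubled = lf.map_batches(lambda df: df.with_columns(pl.col("b") * 2)).collect()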
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..09ad18a --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/lazyframe/frame.pyi @@ -0,0 +1,4174 @@ +#: version 0.20.0 +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use `pl.scan_csv` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use `pl.scan_parquet` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use `pl.scan_ipc` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use `pl.scan_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. 
+ + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to `True`. + If this is set to `True` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. 
+ streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. 
+ Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... 
"c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to `False`. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + This functionality is currently in an alpha state. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + DataFrame directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. 
+ + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. 
+ no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. 
+ Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def sink_ndjson(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_json("out.json") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. 
+ + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... 
) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... ).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to `True` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... 
).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `dynamic_group_by` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... 
pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... ) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... 
+ * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\', \'outer_coalesce\'} + Join strategy. + + * *inner* + Returns rows that have matching values in both tables + * *left* + Returns all rows from the left table, and the matched rows from the + right table + * *outer* + Returns all rows when there is a match in either left or right table + * *outer_coalesce* + Same as \'outer\', but coalesces the key columns + * *cross* + Returns the cartisian product of rows from both tables + * *semi* + Filter rows that have a match in the right table. + * *anti* + Filter rows that not have a match in the right table. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. 
+ + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + join_nulls + Join on null values. By default null values will never produce matches. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 5) + ┌──────┬──────┬──────┬───────┬───────────┐ + │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞══════╪══════╪══════╪═══════╪═══════════╡ + │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │ + │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │ + │ null ┆ null ┆ null ┆ z ┆ d │ + │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │ + └──────┴──────┴──────┴───────┴───────────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. 
+ + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another DataFrame: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the DataFrame. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the DataFrame. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), + polars will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.gather_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill `value` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the DataFrame to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this DataFrame. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. 
+ + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The `schema` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, `predicate_pushdown` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... 
) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the DataFrame at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... 
).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + .. warning:: + This functionality is experimental and may change without it being + considered a breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on; if given `None` the implicit row + index is used as a join key instead. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + include_nulls + If True, null values from the right DataFrame will be used to update the + left DataFrame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... 
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. 
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. 
+ * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + """ + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> Self: + """ + Take every nth row in the LazyFrame and return as a new LazyFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
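The deprecated methods stubbed above (`groupby`, `groupby_rolling`, `group_by_rolling`, `groupby_dynamic`, `map`, `shift_and_fill`, `take_every`) each point to a renamed replacement with the same lazy semantics. As a minimal illustrative sketch (not part of the generated stub output, and assuming a polars version where the renames named in these deprecation notes apply), a few of the new spellings look like this:

    import polars as pl

    lf = pl.LazyFrame({"a": [1, 2, 3, 4], "b": [10, 20, 30, 40]})

    lf.group_by("a").agg(pl.col("b").sum())   # replaces lf.groupby("a").agg(...)
    lf.map_batches(lambda df: df * 2)         # replaces lf.map(lambda df: df * 2)
    lf.shift(1, fill_value=0)                 # replaces lf.shift_and_fill(0, n=1)
    lf.gather_every(2).collect()              # replaces lf.take_every(2)

Each call builds a lazy plan; nothing is executed until `.collect()` is called, which is why the renames are purely cosmetic for existing query code.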
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/series/series deleted file mode 100644 index 4a40006..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/series/series +++ /dev/null @@ -1,4988 +0,0 @@ -import np as np -import pa as pa -import pd as pd -from builtins import PySeries -from datetime import date, datetime, timedelta -from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Object as Object, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown, Utf8 as Utf8 -from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat -from polars.exceptions import ShapeError as ShapeError -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence - -TYPE_CHECKING: bool -_PYARROW_AVAILABLE: bool - -class Series: - _s: _ClassVar[None] = ... - _accessors: _ClassVar[set] = ... 
- def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array) -> Self: - """Construct a Series from an Arrow Array.""" - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: - """Construct a Series from a pandas Series or DatetimeIndex.""" - def _get_ptr(self) -> tuple[int, int, int]: - """ - Get a pointer to the start of the values buffer of a numeric Series. - - This will raise an error if the `Series` contains multiple chunks. - - This will return the offset, length and the pointer itself. - - """ - def __bool__(self) -> NoReturn: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Series: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... - def __eq__(self, other: Any) -> Series | Expr: ... - def __ne__(self, other: Any) -> Series | Expr: ... - def __gt__(self, other: Any) -> Series | Expr: ... - def __lt__(self, other: Any) -> Series | Expr: ... - def __ge__(self, other: Any) -> Series | Expr: ... - def __le__(self, other: Any) -> Series | Expr: ... - def le(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series <= other`.""" - def lt(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series < other`.""" - def eq(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series == other`.""" - def eq_missing(self, other: Any) -> Self | Expr: - ''' - Method equivalent of equality operator `series == other` where `None == None`. - - This differs from the standard `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - See Also - -------- - ne_missing - eq - - Examples - -------- - >>> s1 = pl.Series("a", [333, 200, None]) - >>> s2 = pl.Series("a", [100, 200, None]) - >>> s1.eq(s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - null - ] - >>> s1.eq_missing(s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - ''' - def ne(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series != other`.""" - def ne_missing(self, other: Any) -> Self | Expr: - ''' - Method equivalent of equality operator `series != other` where `None == None`. - - This differs from the standard `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - See Also - -------- - eq_missing - ne - - Examples - -------- - >>> s1 = pl.Series("a", [333, 200, None]) - >>> s2 = pl.Series("a", [100, 200, None]) - >>> s1.ne(s2) - shape: (3,) - Series: \'a\' [bool] - [ - true - false - null - ] - >>> s1.ne_missing(s2) - shape: (3,) - Series: \'a\' [bool] - [ - true - false - false - ] - - ''' - def ge(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series >= other`.""" - def gt(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series > other`.""" - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... 
- def __add__(self, other: Any) -> Self | DataFrame | Expr: ... - def __sub__(self, other: Any) -> Self | Expr: ... - def __truediv__(self, other: Any) -> Series | Expr: ... - def __floordiv__(self, other: Any) -> Series | Expr: ... - def __invert__(self) -> Series: ... - def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... - def __mod__(self, other: Any) -> Series | Expr: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: - """ - Numpy __array__ interface protocol. - - Ensures that `np.asarray(pl.Series(..))` works as expected, see - https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. - """ - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: - """Numpy universal functions.""" - def __column_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. - """ - def _repr_html_(self) -> str: - """Format output data in HTML for display in Jupyter Notebooks.""" - def item(self, index: int | None = ...) -> Any: - ''' - Return the Series as a scalar, or return the element at the given index. - - If no index is provided, this is equivalent to `s[0]`, with a check - that the shape is (1,). With an index, this is equivalent to `s[index]`. - - Examples - -------- - >>> s1 = pl.Series("a", [1]) - >>> s1.item() - 1 - >>> s2 = pl.Series("a", [9, 8, 7]) - >>> s2.cum_sum().item(-1) - 24 - - ''' - def estimated_size(self, unit: SizeUnit = ...) -> int | float: - ''' - Return an estimation of the total (heap) allocated size of the Series. - - Estimated size is given in the specified unit (bytes by default). - - This estimation is the sum of the size of its buffers, validity, including - nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the - size of 2 arrays is not the sum of the sizes computed from this function. In - particular, [`StructArray`]\'s size is an upper bound. - - When an array is sliced, its allocated size remains constant because the buffer - unchanged. However, this function will yield a smaller number. This is because - this function returns the visible size of the buffer, not its total capacity. 
- - FFI buffers are included in this estimation. - - Parameters - ---------- - unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} - Scale the returned size to the given unit. - - Examples - -------- - >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) - >>> s.estimated_size() - 4000000 - >>> s.estimated_size("mb") - 3.814697265625 - - ''' - def sqrt(self) -> Series: - """ - Compute the square root of the elements. - - Syntactic sugar for - - >>> pl.Series([1, 2]) ** 0.5 - shape: (2,) - Series: '' [f64] - [ - 1.0 - 1.414214 - ] - - """ - def cbrt(self) -> Series: - """ - Compute the cube root of the elements. - - Optimization for - - >>> pl.Series([1, 2]) ** (1.0 / 3) - shape: (2,) - Series: '' [f64] - [ - 1.0 - 1.259921 - ] - - """ - def any(self) -> bool | None: - """ - Return whether any of the values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is `None`. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - bool or None - - Examples - -------- - >>> pl.Series([True, False]).any() - True - >>> pl.Series([False, False]).any() - False - >>> pl.Series([None, False]).any() - False - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None - - """ - def all(self) -> bool | None: - """ - Return whether all values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is `None`. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - bool or None - - Examples - -------- - >>> pl.Series([True, True]).all() - True - >>> pl.Series([False, True]).all() - False - >>> pl.Series([None, True]).all() - True - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None - - """ - def log(self, base: float = ...) -> Series: - """Compute the logarithm to a given base.""" - def log1p(self) -> Series: - """Compute the natural logarithm of the input array plus one, element-wise.""" - def log10(self) -> Series: - """Compute the base 10 logarithm of the input array, element-wise.""" - def exp(self) -> Series: - """Compute the exponential, element-wise.""" - def drop_nulls(self) -> Series: - ''' - Drop all null values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nans - - Notes - ----- - A null value is not the same as a NaN value. - To drop NaN values, use :func:`drop_nans`. - - Examples - -------- - >>> s = pl.Series([1.0, None, 3.0, float("nan")]) - >>> s.drop_nulls() - shape: (3,) - Series: \'\' [f64] - [ - 1.0 - 3.0 - NaN - ] - - ''' - def drop_nans(self) -> Series: - ''' - Drop all floating point NaN values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nulls - - Notes - ----- - A NaN value is not the same as a null value. - To drop null values, use :func:`drop_nulls`. 
- - Examples - -------- - >>> s = pl.Series([1.0, None, 3.0, float("nan")]) - >>> s.drop_nans() - shape: (3,) - Series: \'\' [f64] - [ - 1.0 - null - 3.0 - ] - - ''' - def to_frame(self, name: str | None = ...) -> DataFrame: - ''' - Cast this Series to a DataFrame. - - Parameters - ---------- - name - optionally name/rename the Series column in the new DataFrame. - - Examples - -------- - >>> s = pl.Series("a", [123, 456]) - >>> df = s.to_frame() - >>> df - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 123 │ - │ 456 │ - └─────┘ - - >>> df = s.to_frame("xyz") - >>> df - shape: (2, 1) - ┌─────┐ - │ xyz │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 123 │ - │ 456 │ - └─────┘ - - ''' - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: - ''' - Quick summary statistics of a Series. - - Series with mixed datatypes will return summary statistics for the datatype of - the first value. - - Parameters - ---------- - percentiles - One or more percentiles to include in the summary statistics (if the - Series has a numeric dtype). All values must be in the range `[0, 1]`. - - Notes - ----- - The median is included by default as the 50% percentile. - - Returns - ------- - DataFrame - Mapping with summary statistics of a Series. - - Examples - -------- - >>> series_num = pl.Series([1, 2, 3, 4, 5]) - >>> series_num.describe() - shape: (9, 2) - ┌────────────┬──────────┐ - │ statistic ┆ value │ - │ --- ┆ --- │ - │ str ┆ f64 │ - ╞════════════╪══════════╡ - │ count ┆ 5.0 │ - │ null_count ┆ 0.0 │ - │ mean ┆ 3.0 │ - │ std ┆ 1.581139 │ - │ min ┆ 1.0 │ - │ 25% ┆ 2.0 │ - │ 50% ┆ 3.0 │ - │ 75% ┆ 4.0 │ - │ max ┆ 5.0 │ - └────────────┴──────────┘ - - >>> series_str = pl.Series(["a", "a", None, "b", "c"]) - >>> series_str.describe() - shape: (3, 2) - ┌────────────┬───────┐ - │ statistic ┆ value │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════════╪═══════╡ - │ count ┆ 5 │ - │ null_count ┆ 1 │ - │ unique ┆ 4 │ - └────────────┴───────┘ - - ''' - def sum(self) -> int | float: - ''' - Reduce this Series to the sum value. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.sum() - 6 - - ''' - def mean(self) -> int | float | None: - ''' - Reduce this Series to the mean value. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.mean() - 2.0 - - ''' - def product(self) -> int | float: - """Reduce this Series to the product value.""" - def pow(self, exponent: int | float | None | Series) -> Series: - ''' - Raise to the power of the given exponent. - - Parameters - ---------- - exponent - The exponent. Accepts Series input. - - Examples - -------- - >>> s = pl.Series("foo", [1, 2, 3, 4]) - >>> s.pow(3) - shape: (4,) - Series: \'foo\' [f64] - [ - 1.0 - 8.0 - 27.0 - 64.0 - ] - - ''' - def min(self) -> PythonLiteral | None: - ''' - Get the minimal value in this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.min() - 1 - - ''' - def max(self) -> PythonLiteral | None: - ''' - Get the maximum value in this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.max() - 3 - - ''' - def nan_max(self) -> int | float | date | datetime | timedelta | str: - """ - Get maximum value, but propagate/poison encountered NaN values. - - This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. 
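A short usage sketch of the reductions and the `to_frame` conversion documented above, reusing the docstring inputs:

import polars as pl

s = pl.Series("a", [1, 2, 3])
assert s.sum() == 6
assert s.mean() == 2.0
assert (s.min(), s.max()) == (1, 3)

df = s.to_frame("xyz")  # single-column DataFrame renamed to "xyz"
print(df.shape)         # (3, 1)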
- - """ - def nan_min(self) -> int | float | date | datetime | timedelta | str: - """ - Get minimum value, but propagate/poison encountered NaN values. - - This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - """ - def std(self, ddof: int = ...) -> float | None: - ''' - Get the standard deviation of this Series. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.std() - 1.0 - - ''' - def var(self, ddof: int = ...) -> float | None: - ''' - Get variance of this Series. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.var() - 1.0 - - ''' - def median(self) -> float | None: - ''' - Get the median of this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.median() - 2.0 - - ''' - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: - ''' - Get the quantile value of this Series. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.quantile(0.5) - 2.0 - - ''' - def to_dummies(self, separator: str = ...) -> DataFrame: - ''' - Get dummy/indicator variables. - - Parameters - ---------- - separator - Separator/delimiter used when generating column names. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.to_dummies() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a_1 ┆ a_2 ┆ a_3 │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 0 ┆ 0 │ - │ 0 ┆ 1 ┆ 0 │ - │ 0 ┆ 0 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def cut(self, breaks: Sequence[float]) -> Series | DataFrame: - ''' - Bin continuous values into discrete categories. - - Parameters - ---------- - breaks - List of unique cut points. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - break_point_label - Name of the breakpoint column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - category_label - Name of the category column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - left_closed - Set the intervals to be left-closed instead of right-closed. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - as_series - If set to `False`, return a DataFrame containing the original values, - the breakpoints, and the categories. - - .. deprecated:: 0.19.0 - This parameter will be removed. The same behavior can be achieved by - setting `include_breaks=True`, unnesting the resulting struct Series, - and adding the result to the original Series. 
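The dispersion statistics and `to_dummies` above can be exercised like this (a sketch that mirrors the docstring examples):

import polars as pl

s = pl.Series("a", [1, 2, 3])
assert s.std() == 1.0          # ddof defaults to 1
assert s.var() == 1.0
assert s.median() == 2.0
assert s.quantile(0.5) == 2.0
print(s.to_dummies())          # u8 indicator columns a_1, a_2, a_3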
- - Returns - ------- - Series - Series of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise a Series of data type :class:`Struct`. - - See Also - -------- - qcut - - Examples - -------- - Divide the column into three categories. - - >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) - >>> s.cut([-1, 1], labels=["a", "b", "c"]) - shape: (5,) - Series: \'foo\' [cat] - [ - "a" - "a" - "b" - "b" - "c" - ] - - Create a DataFrame with the breakpoint and category for each value. - - >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") - >>> s.to_frame().with_columns(cut).unnest("cut") - shape: (5, 3) - ┌─────┬─────────────┬────────────┐ - │ foo ┆ break_point ┆ category │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪═════════════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴─────────────┴────────────┘ - - ''' - def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: - ''' - Bin continuous values into discrete categories based on their quantiles. - - Parameters - ---------- - quantiles - Either a list of quantile probabilities between 0 and 1 or a positive - integer determining the number of bins with uniform probability. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - left_closed - Set the intervals to be left-closed instead of right-closed. - allow_duplicates - If set to `True`, duplicates in the resulting quantiles are dropped, - rather than raising a `DuplicateError`. This can happen even with unique - probabilities, depending on the data. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - break_point_label - Name of the breakpoint column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - category_label - Name of the category column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - as_series - If set to `False`, return a DataFrame containing the original values, - the breakpoints, and the categories. - - .. deprecated:: 0.19.0 - This parameter will be removed. The same behavior can be achieved by - setting `include_breaks=True`, unnesting the resulting struct Series, - and adding the result to the original Series. - - Returns - ------- - Series - Series of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise a Series of data type :class:`Struct`. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - See Also - -------- - cut - - Examples - -------- - Divide a column into three categories according to pre-defined quantile - probabilities. - - >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) - >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) - shape: (5,) - Series: \'foo\' [cat] - [ - "a" - "a" - "b" - "b" - "c" - ] - - Divide a column into two categories using uniform quantile probabilities. 
- - >>> s.qcut(2, labels=["low", "high"], left_closed=True) - shape: (5,) - Series: \'foo\' [cat] - [ - "low" - "low" - "high" - "high" - "high" - ] - - Create a DataFrame with the breakpoint and category for each value. - - >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") - >>> s.to_frame().with_columns(cut).unnest("cut") - shape: (5, 3) - ┌─────┬─────────────┬────────────┐ - │ foo ┆ break_point ┆ category │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪═════════════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴─────────────┴────────────┘ - - ''' - def rle(self) -> Series: - ''' - Get the lengths of runs of identical values. - - Returns - ------- - Series - Series of data type :class:`Struct` with Fields "lengths" and "values". - - Examples - -------- - >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) - >>> s.rle().struct.unnest() - shape: (6, 2) - ┌─────────┬────────┐ - │ lengths ┆ values │ - │ --- ┆ --- │ - │ i32 ┆ i64 │ - ╞═════════╪════════╡ - │ 2 ┆ 1 │ - │ 1 ┆ 2 │ - │ 1 ┆ 1 │ - │ 1 ┆ null │ - │ 1 ┆ 1 │ - │ 2 ┆ 3 │ - └─────────┴────────┘ - ''' - def rle_id(self) -> Series: - ''' - Map values to run IDs. - - Similar to RLE, but it maps each value to an ID corresponding to the run into - which it falls. This is especially useful when you want to define groups by - runs of identical values rather than the values themselves. - - Returns - ------- - Series - - See Also - -------- - rle - - Examples - -------- - >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) - >>> s.rle_id() - shape: (8,) - Series: \'s\' [u32] - [ - 0 - 0 - 1 - 2 - 3 - 4 - 5 - 5 - ] - ''' - def hist(self, bins: list[float] | None = ...) -> DataFrame: - ''' - Bin values into buckets and count their occurrences. - - Parameters - ---------- - bins - Discretizations to make. - If None given, we determine the boundaries based on the data. - bin_count - If no bins provided, this will be used to determine - the distance of the bins - - Returns - ------- - DataFrame - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Examples - -------- - >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) - >>> a.hist(bin_count=4) - shape: (5, 3) - ┌─────────────┬─────────────┬─────────┐ - │ break_point ┆ category ┆ a_count │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ cat ┆ u32 │ - ╞═════════════╪═════════════╪═════════╡ - │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ - │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ - │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ - │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ - │ inf ┆ (6.75, inf] ┆ 2 │ - └─────────────┴─────────────┴─────────┘ - - ''' - def value_counts(self) -> DataFrame: - ''' - Count the occurrences of unique values. - - Parameters - ---------- - sort - Sort the output by count in descending order. - If set to `False` (default), the order of the output is random. - parallel - Execute the computation in parallel. - - .. note:: - This option should likely not be enabled in a group by context, - as the computation is already parallelized per group. - - Returns - ------- - DataFrame - Mapping of unique values to their count. - - Examples - -------- - >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) - >>> s.value_counts() # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌───────┬────────┐ - │ color ┆ counts │ - │ --- ┆ --- │ - │ str ┆ u32 │ - ╞═══════╪════════╡ - │ red ┆ 2 │ - │ green ┆ 1 │ - │ blue ┆ 3 │ - └───────┴────────┘ - - Sort the output by count. 
- - shape: (3, 2) - ┌───────┬────────┐ - │ color ┆ counts │ - │ --- ┆ --- │ - │ str ┆ u32 │ - ╞═══════╪════════╡ - │ blue ┆ 3 │ - │ red ┆ 2 │ - │ green ┆ 1 │ - └───────┴────────┘ - - ''' - def unique_counts(self) -> Series: - ''' - Return a count of the unique values in the order of appearance. - - Examples - -------- - >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) - >>> s.unique_counts() - shape: (3,) - Series: \'id\' [u32] - [ - 1 - 2 - 3 - ] - - ''' - def entropy(self, base: float = ...) -> float | None: - """ - Computes the entropy. - - Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. - - Parameters - ---------- - base - Given base, defaults to `e` - normalize - Normalize pk if it doesn't sum to 1. - - Examples - -------- - >>> a = pl.Series([0.99, 0.005, 0.005]) - >>> a.entropy(normalize=True) - 0.06293300616044681 - >>> b = pl.Series([0.65, 0.10, 0.25]) - >>> b.entropy(normalize=True) - 0.8568409950394724 - - """ - def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: - ''' - Run an expression over a sliding window that increases `1` slot every iteration. - - Parameters - ---------- - expr - Expression to evaluate - min_periods - Number of valid values there should be in the window before the expression - is evaluated. valid values = `length - null_count` - parallel - Run in parallel. Don\'t do this in a group by or another operation that - already has much parallelization. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - This can be really slow as it can have `O(n^2)` complexity. Don\'t use this - for operations that visit all elements. - - Examples - -------- - >>> s = pl.Series("values", [1, 2, 3, 4, 5]) - >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) - shape: (5,) - Series: \'values\' [f64] - [ - 0.0 - -3.0 - -8.0 - -15.0 - -24.0 - ] - - ''' - def alias(self, name: str) -> Series: - ''' - Rename the series. - - Parameters - ---------- - name - The new name. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.alias("b") - shape: (3,) - Series: \'b\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def rename(self, name: str) -> Series: - ''' - Rename this Series. - - Alias for :func:`Series.alias`. - - Parameters - ---------- - name - New name. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.rename("b") - shape: (3,) - Series: \'b\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def chunk_lengths(self) -> list[int]: - ''' - Get the length of each individual chunk. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("a", [4, 5, 6]) - - Concatenate Series with rechunk = True - - >>> pl.concat([s, s2]).chunk_lengths() - [6] - - Concatenate Series with rechunk = False - - >>> pl.concat([s, s2], rechunk=False).chunk_lengths() - [3, 3] - - ''' - def n_chunks(self) -> int: - ''' - Get the number of chunks that this Series contains. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.n_chunks() - 1 - >>> s2 = pl.Series("a", [4, 5, 6]) - - Concatenate Series with rechunk = True - - >>> pl.concat([s, s2]).n_chunks() - 1 - - Concatenate Series with rechunk = False - - >>> pl.concat([s, s2], rechunk=False).n_chunks() - 2 - - ''' - def cum_max(self) -> Series: - ''' - Get an array with the cumulative max computed at every element. - - Parameters - ---------- - reverse - reverse the operation. 
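A hedged sketch tying together the binning and run-length helpers documented above (`cut`, `qcut`, `rle`, `rle_id`, `value_counts`); several of these are flagged experimental in the docstrings, so behaviour may change between releases:

import polars as pl

s = pl.Series("foo", [-2, -1, 0, 1, 2])
print(s.cut([-1, 1], labels=["a", "b", "c"]))        # Categorical bins
print(s.qcut([0.25, 0.75], labels=["a", "b", "c"]))  # quantile-based bins

runs = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])
print(runs.rle().struct.unnest())  # lengths and values of identical runs
print(runs.rle_id())               # run id per element

colors = pl.Series("color", ["red", "blue", "red"])
print(colors.value_counts())       # row order is random unless sorted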
- - Examples - -------- - >>> s = pl.Series("s", [3, 5, 1]) - >>> s.cum_max() - shape: (3,) - Series: \'s\' [i64] - [ - 3 - 5 - 5 - ] - - ''' - def cum_min(self) -> Series: - ''' - Get an array with the cumulative min computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Examples - -------- - >>> s = pl.Series("s", [1, 2, 3]) - >>> s.cum_min() - shape: (3,) - Series: \'s\' [i64] - [ - 1 - 1 - 1 - ] - - ''' - def cum_prod(self) -> Series: - ''' - Get an array with the cumulative product computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.cum_prod() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 6 - ] - - ''' - def cum_sum(self) -> Series: - ''' - Get an array with the cumulative sum computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.cum_sum() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 3 - 6 - ] - - ''' - def slice(self, offset: int, length: int | None = ...) -> Series: - ''' - Get a slice of this Series. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.slice(1, 2) - shape: (2,) - Series: \'a\' [i64] - [ - 2 - 3 - ] - - ''' - def append(self, other: Series) -> Self: - ''' - Append a Series to this one. - - Parameters - ---------- - other - Series to append. - append_chunks - .. deprecated:: 0.18.8 - This argument will be removed and `append` will change to always - behave like `append_chunks=True` (the previous default). For the - behavior of `append_chunks=False`, use `Series.extend`. - - If set to `True` the append operation will add the chunks from `other` to - self. This is super cheap. - - If set to `False` the append operation will do the same as - `DataFrame.extend` which extends the memory backed by this `Series` with - the values from `other`. - - Different from `append chunks`, `extend` appends the data from `other` to - the underlying memory locations and thus may cause a reallocation (which are - expensive). - - If this does not cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `append_chunks` when you want to do a query after a - single append. For instance during online operations where you add `n` rows - and rerun a query. - - Prefer `append_chunks` over `extend` when you want to append many times - before doing a query. For instance when you read in multiple files and when - to store them in a single `Series`. In the latter case, finish the sequence - of `append_chunks` operations with a `rechunk`. - - Warnings - -------- - This method modifies the series in-place. The series is returned for - convenience only. - - See Also - -------- - extend - - Examples - -------- - >>> a = pl.Series("a", [1, 2, 3]) - >>> b = pl.Series("b", [4, 5]) - >>> a.append(b) - shape: (5,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ] - - The resulting series will consist of multiple chunks. 
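The cumulative aggregations and `slice` above, in one small runnable sketch:

import polars as pl

s = pl.Series("a", [1, 2, 3])
print(s.cum_sum())                          # 1, 3, 6
print(s.cum_prod())                         # 1, 2, 6
print(pl.Series("s", [3, 5, 1]).cum_max())  # 3, 5, 5 (running maximum)
print(s.slice(1, 2))                        # two elements starting at offset 1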
- - >>> a.n_chunks() - 2 - - ''' - def extend(self, other: Series) -> Self: - ''' - Extend the memory backed by this Series with the values from another. - - Different from `append`, which adds the chunks from `other` to the chunks of - this series, `extend` appends the data from `other` to the underlying memory - locations and thus may cause a reallocation (which is expensive). - - If this does `not` cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `append` when you want to do a query after a single - append. For instance, during online operations where you add `n` rows - and rerun a query. - - Prefer `append` over `extend` when you want to append many times - before doing a query. For instance, when you read in multiple files and want - to store them in a single `Series`. In the latter case, finish the sequence - of `append` operations with a `rechunk`. - - Parameters - ---------- - other - Series to extend the series with. - - Warnings - -------- - This method modifies the series in-place. The series is returned for - convenience only. - - See Also - -------- - append - - Examples - -------- - >>> a = pl.Series("a", [1, 2, 3]) - >>> b = pl.Series("b", [4, 5]) - >>> a.extend(b) - shape: (5,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ] - - The resulting series will consist of a single chunk. - - >>> a.n_chunks() - 1 - - ''' - def filter(self, predicate: Series | list[bool]) -> Self: - ''' - Filter elements by a boolean mask. - - The original order of the remaining elements is preserved. - - Parameters - ---------- - predicate - Boolean mask. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> mask = pl.Series("", [True, False, True]) - >>> s.filter(mask) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 3 - ] - - ''' - def head(self, n: int = ...) -> Series: - ''' - Get the first `n` elements. - - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the last `abs(n)`. - - See Also - -------- - tail, slice - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.head(3) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - Pass a negative value to get all rows `except` the last `abs(n)`. - - >>> s.head(-3) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 2 - ] - - ''' - def tail(self, n: int = ...) -> Series: - ''' - Get the last `n` elements. - - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the first `abs(n)`. - - See Also - -------- - head, slice - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.tail(3) - shape: (3,) - Series: \'a\' [i64] - [ - 3 - 4 - 5 - ] - - Pass a negative value to get all rows `except` the first `abs(n)`. - - >>> s.tail(-3) - shape: (2,) - Series: \'a\' [i64] - [ - 4 - 5 - ] - - ''' - def limit(self, n: int = ...) -> Series: - """ - Get the first `n` elements. - - Alias for :func:`Series.head`. - - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the last `abs(n)`. - - See Also - -------- - head - - """ - def gather_every(self, n: int) -> Series: - ''' - Take every nth value in the Series and return as new Series. - - Parameters - ---------- - n - Gather every *n*-th row. 
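To make the `append` vs `extend` distinction above concrete, a minimal sketch mirroring the docstring examples (both methods mutate in place and return the Series only for convenience):

import polars as pl

a = pl.Series("a", [1, 2, 3])
a.append(pl.Series("b", [4, 5]))   # adds the other Series' chunks
assert a.n_chunks() == 2           # cheap, but leaves multiple chunks

c = pl.Series("c", [1, 2, 3])
c.extend(pl.Series([4, 5]))        # copies into the backing buffer
assert c.n_chunks() == 1           # single chunk, faster follow-up queries

print(a.filter([True, False, True, False, True]))
print(a.head(3), a.tail(2))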
- - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.gather_every(2) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 3 - ] - - ''' - def sort(self) -> Self: - ''' - Sort this Series. - - Parameters - ---------- - descending - Sort in descending order. - in_place - Sort in-place. - - Examples - -------- - >>> s = pl.Series("a", [1, 3, 4, 2]) - >>> s.sort() - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - ] - >>> s.sort(descending=True) - shape: (4,) - Series: \'a\' [i64] - [ - 4 - 3 - 2 - 1 - ] - - ''' - def top_k(self, k: int | IntoExprColumn = ...) -> Series: - ''' - Return the `k` largest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - bottom_k - - Examples - -------- - >>> s = pl.Series("a", [2, 5, 1, 4, 3]) - >>> s.top_k(3) - shape: (3,) - Series: \'a\' [i64] - [ - 5 - 4 - 3 - ] - - ''' - def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: - ''' - Return the `k` smallest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - top_k - - Examples - -------- - >>> s = pl.Series("a", [2, 5, 1, 4, 3]) - >>> s.bottom_k(3) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def arg_sort(self) -> Series: - ''' - Get the index values that would sort this Series. - - Parameters - ---------- - descending - Sort in descending order. - nulls_last - Place null values last instead of first. - - Examples - -------- - >>> s = pl.Series("a", [5, 3, 4, 1, 2]) - >>> s.arg_sort() - shape: (5,) - Series: \'a\' [u32] - [ - 3 - 4 - 1 - 2 - 0 - ] - - ''' - def arg_unique(self) -> Series: - ''' - Get unique index as Series. - - Returns - ------- - Series - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.arg_unique() - shape: (3,) - Series: \'a\' [u32] - [ - 0 - 1 - 3 - ] - - ''' - def arg_min(self) -> int | None: - ''' - Get the index of the minimal value. - - Returns - ------- - int - - Examples - -------- - >>> s = pl.Series("a", [3, 2, 1]) - >>> s.arg_min() - 2 - - ''' - def arg_max(self) -> int | None: - ''' - Get the index of the maximal value. - - Returns - ------- - int - - Examples - -------- - >>> s = pl.Series("a", [3, 2, 1]) - >>> s.arg_max() - 0 - - ''' - def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: - """ - Find indices where elements should be inserted to maintain order. - - .. math:: a[i-1] < v <= a[i] - - Parameters - ---------- - element - Expression or scalar value. - side : {'any', 'left', 'right'} - If 'any', the index of the first suitable location found is given. - If 'left', the index of the leftmost suitable location found is given. - If 'right', return the rightmost suitable location found is given. - - """ - def unique(self) -> Series: - ''' - Get unique elements in series. - - Parameters - ---------- - maintain_order - Maintain order of data. This requires more work. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.unique().sort() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: - ''' - Take values by index. - - Parameters - ---------- - indices - Index location used for selection. 
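A brief sketch of the ordering helpers documented above:

import polars as pl

s = pl.Series("a", [2, 5, 1, 4, 3])
print(s.sort())
print(s.top_k(3))       # 5, 4, 3
print(s.bottom_k(3))    # 1, 2, 3
print(s.arg_sort())     # indices that would sort s
print(s.arg_max(), s.arg_min())
print(pl.Series("a", [1, 2, 2, 3]).unique().sort())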
- - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.gather([1, 3]) - shape: (2,) - Series: \'a\' [i64] - [ - 2 - 4 - ] - - ''' - def null_count(self) -> int: - """Count the null values in this Series.""" - def has_validity(self) -> bool: - """ - Return True if the Series has a validity bitmask. - - If there is no mask, it means that there are no `null` values. - - Notes - ----- - While the *absence* of a validity bitmask guarantees that a Series does not - have `null` values, the converse is not true, eg: the *presence* of a - bitmask does not mean that there are null values, as every value of the - bitmask could be `false`. - - To confirm that a column has `null` values use :func:`null_count`. - - """ - def is_empty(self) -> bool: - ''' - Check if the Series is empty. - - Examples - -------- - >>> s = pl.Series("a", [], dtype=pl.Float32) - >>> s.is_empty() - True - - ''' - def is_sorted(self) -> bool: - """ - Check if the Series is sorted. - - Parameters - ---------- - descending - Check if the Series is sorted in descending order - - """ - def not_(self) -> Series: - ''' - Negate a boolean Series. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [True, False, False]) - >>> s.not_() - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - ''' - def is_null(self) -> Series: - ''' - Returns a boolean Series indicating which values are null. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) - >>> s.is_null() - shape: (4,) - Series: \'a\' [bool] - [ - false - false - false - true - ] - - ''' - def is_not_null(self) -> Series: - ''' - Returns a boolean Series indicating which values are not null. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) - >>> s.is_not_null() - shape: (4,) - Series: \'a\' [bool] - [ - true - true - true - false - ] - - ''' - def is_finite(self) -> Series: - ''' - Returns a boolean Series indicating which values are finite. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, np.inf]) - >>> s.is_finite() - shape: (3,) - Series: \'a\' [bool] - [ - true - true - false - ] - - ''' - def is_infinite(self) -> Series: - ''' - Returns a boolean Series indicating which values are infinite. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, np.inf]) - >>> s.is_infinite() - shape: (3,) - Series: \'a\' [bool] - [ - false - false - true - ] - - ''' - def is_nan(self) -> Series: - ''' - Returns a boolean Series indicating which values are not NaN. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) - >>> s.is_nan() - shape: (4,) - Series: \'a\' [bool] - [ - false - false - false - true - ] - - ''' - def is_not_nan(self) -> Series: - ''' - Returns a boolean Series indicating which values are not NaN. - - Returns - ------- - Series - Series of data type :class:`Boolean`. 
- - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) - >>> s.is_not_nan() - shape: (4,) - Series: \'a\' [bool] - [ - true - true - true - false - ] - - ''' - def is_in(self, other: Series | Collection[Any]) -> Series: - ''' - Check if elements of this Series are in the other Series. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [2, 4]) - >>> s2.is_in(s) - shape: (2,) - Series: \'b\' [bool] - [ - true - false - ] - - >>> # check if some values are a member of sublists - >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) - >>> optional_members = pl.Series("optional_members", [1, 2, 3]) - >>> print(sets) - shape: (3,) - Series: \'sets\' [list[i64]] - [ - [1, 2, 3] - [1, 2] - [9, 10] - ] - >>> print(optional_members) - shape: (3,) - Series: \'optional_members\' [i64] - [ - 1 - 2 - 3 - ] - >>> optional_members.is_in(sets) - shape: (3,) - Series: \'optional_members\' [bool] - [ - true - true - false - ] - - ''' - def arg_true(self) -> Series: - ''' - Get index values where Boolean Series evaluate True. - - Returns - ------- - Series - Series of data type :class:`UInt32`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> (s == 2).arg_true() - shape: (1,) - Series: \'a\' [u32] - [ - 1 - ] - - ''' - def is_unique(self) -> Series: - ''' - Get mask of all unique values. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.is_unique() - shape: (4,) - Series: \'a\' [bool] - [ - true - false - false - true - ] - - ''' - def is_first_distinct(self) -> Series: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series([1, 1, 2, 3, 2]) - >>> s.is_first_distinct() - shape: (5,) - Series: '' [bool] - [ - true - false - true - true - false - ] - - """ - def is_last_distinct(self) -> Series: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series([1, 1, 2, 3, 2]) - >>> s.is_last_distinct() - shape: (5,) - Series: '' [bool] - [ - false - true - false - true - true - ] - - """ - def is_duplicated(self) -> Series: - ''' - Get mask of all duplicated values. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.is_duplicated() - shape: (4,) - Series: \'a\' [bool] - [ - false - true - true - false - ] - - ''' - def explode(self) -> Series: - """ - Explode a list Series. - - This means that every item is expanded to a new row. - - Returns - ------- - Series - Series with the data type of the list elements. - - See Also - -------- - Series.list.explode : Explode a list column. - Series.str.explode : Explode a string column. - - """ - def equals(self, other: Series) -> bool: - ''' - Check whether the Series is equal to another Series. - - Parameters - ---------- - other - Series to compare with. - null_equal - Consider null values as equal. - strict - Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a - `pl.Int64` will return `False`. 
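The boolean-mask family above (null checks, distinct/duplicate detection, membership) in a short sketch reusing the docstring inputs:

import polars as pl

s = pl.Series("a", [1.0, 2.0, 3.0, None])
print(s.is_null())       # true only for the trailing null
print(s.is_not_null())

ids = pl.Series([1, 1, 2, 3, 2])
print(ids.is_first_distinct())
print(ids.is_duplicated())

print(pl.Series("b", [2, 4]).is_in(pl.Series("a", [1, 2, 3])))  # true, false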
- - See Also - -------- - assert_series_equal - - Examples - -------- - >>> s1 = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [4, 5, 6]) - >>> s1.equals(s1) - True - >>> s1.equals(s2) - False - ''' - def len(self) -> int: - ''' - Return the number of elements in this Series. - - Null values are treated like regular elements in this context. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None]) - >>> s.len() - 3 - - ''' - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: - ''' - Cast between data types. - - Parameters - ---------- - dtype - DataType to cast to. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> s = pl.Series("a", [True, False, True]) - >>> s - shape: (3,) - Series: \'a\' [bool] - [ - true - false - true - ] - - >>> s.cast(pl.UInt32) - shape: (3,) - Series: \'a\' [u32] - [ - 1 - 0 - 1 - ] - - ''' - def to_physical(self) -> Series: - ''' - Cast to physical representation of the logical dtype. - - - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` - - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` - - `List(inner)` -> `List(physical of inner)` - - Other data types will be left unchanged. - - Examples - -------- - Replicating the pandas - `pd.Series.factorize - `_ - method. - - >>> s = pl.Series("values", ["a", None, "x", "a"]) - >>> s.cast(pl.Categorical).to_physical() - shape: (4,) - Series: \'values\' [u32] - [ - 0 - null - 1 - 0 - ] - - ''' - def to_list(self) -> list[Any]: - ''' - Convert this Series to a Python List. This operation clones data. - - Parameters - ---------- - use_pyarrow - Use pyarrow for the conversion. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.to_list() - [1, 2, 3] - >>> type(s.to_list()) - - - ''' - def rechunk(self) -> Self: - """ - Create a single chunk of memory for this Series. - - Parameters - ---------- - in_place - In place or not. - - """ - def reverse(self) -> Series: - ''' - Return Series in reverse order. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) - >>> s.reverse() - shape: (3,) - Series: \'a\' [i8] - [ - 3 - 2 - 1 - ] - - ''' - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: - ''' - Get a boolean mask of the values that fall between the given start/end values. - - Parameters - ---------- - lower_bound - Lower bound value. Accepts expression input. Non-expression inputs - (including strings) are parsed as literals. - upper_bound - Upper bound value. Accepts expression input. Non-expression inputs - (including strings) are parsed as literals. - closed : {\'both\', \'left\', \'right\', \'none\'} - Define which sides of the interval are closed (inclusive). 
- - Examples - -------- - >>> s = pl.Series("num", [1, 2, 3, 4, 5]) - >>> s.is_between(2, 4) - shape: (5,) - Series: \'num\' [bool] - [ - false - true - true - true - false - ] - - Use the `closed` argument to include or exclude the values at the bounds: - - >>> s.is_between(2, 4, closed="left") - shape: (5,) - Series: \'num\' [bool] - [ - false - true - true - false - false - ] - - You can also use strings as well as numeric/temporal values: - - >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) - >>> s.is_between("b", "d", closed="both") - shape: (5,) - Series: \'s\' [bool] - [ - false - true - true - true - false - ] - - ''' - def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: - ''' - Convert this Series to numpy. - - This operation may clone data but is completely safe. Note that: - - - data which is purely numeric AND without null values is not cloned; - - floating point `nan` values can be zero-copied; - - booleans can\'t be zero-copied. - - To ensure that no data is cloned, set `zero_copy_only=True`. - - Parameters - ---------- - *args - args will be sent to pyarrow.Array.to_numpy. - zero_copy_only - If True, an exception will be raised if the conversion to a numpy - array would require copying the underlying data (e.g. in presence - of nulls, or for non-primitive types). - writable - For numpy arrays created with zero copy (view on the Arrow data), - the resulting array is not writable (Arrow data is immutable). - By setting this to True, a copy of the array is made to ensure - it is writable. - use_pyarrow - Use `pyarrow.Array.to_numpy - `_ - - for the conversion to numpy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> arr = s.to_numpy() - >>> arr # doctest: +IGNORE_RESULT - array([1, 2, 3], dtype=int64) - >>> type(arr) - - - ''' - def _view(self) -> SeriesView: - ''' - Get a view into this Series data with a numpy array. - - This operation doesn\'t clone data, but does not include missing values. - - Returns - ------- - SeriesView - - Parameters - ---------- - ignore_nulls - If True then nulls are converted to 0. - If False then an Exception is raised if nulls are present. - - Examples - -------- - >>> s = pl.Series("a", [1, None]) - >>> s._view(ignore_nulls=True) - SeriesView([1, 0]) - - ''' - def to_arrow(self) -> pa.Array: - ''' - Get the underlying Arrow Array. - - If the Series contains only a single chunk this operation is zero copy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s = s.to_arrow() - >>> s # doctest: +ELLIPSIS - - [ - 1, - 2, - 3 - ] - - ''' - def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: - ''' - Convert this Series to a pandas Series. - - This requires that :mod:`pandas` and :mod:`pyarrow` are installed. - This operation clones data, unless `use_pyarrow_extension_array=True`. - - Parameters - ---------- - use_pyarrow_extension_array - Further operations on this Pandas series, might trigger conversion to numpy. - Use PyArrow backed-extension array instead of numpy array for pandas - Series. This allows zero copy operations and preservation of nulls - values. - Further operations on this pandas Series, might trigger conversion - to NumPy arrays if that operation is not supported by pyarrow compute - functions. - kwargs - Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
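A sketch of the conversion paths documented above; `to_numpy` needs numpy, and `to_pandas` additionally needs pandas and pyarrow, so those are assumptions about the environment:

import polars as pl

s = pl.Series("a", [1, 2, 3])
print(s.cast(pl.UInt32))
print(s.to_list())              # [1, 2, 3]

arr = s.to_numpy()              # may avoid a copy for non-null numeric data
print(type(arr), arr)

print(s.to_arrow())             # zero copy when the Series is a single chunk
print(s.is_between(1, 2, closed="left"))
# s.to_pandas() works the same way once pandas and pyarrow are installed.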
- - Examples - -------- - >>> s1 = pl.Series("a", [1, 2, 3]) - >>> s1.to_pandas() - 0 1 - 1 2 - 2 3 - Name: a, dtype: int64 - >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP - 0 1 - 1 2 - 2 3 - Name: a, dtype: int64[pyarrow] - >>> s2 = pl.Series("b", [1, 2, None, 4]) - >>> s2.to_pandas() - 0 1.0 - 1 2.0 - 2 NaN - 3 4.0 - Name: b, dtype: float64 - >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP - 0 1 - 1 2 - 2 - 3 4 - Name: b, dtype: int64[pyarrow] - - ''' - def to_init_repr(self, n: int = ...) -> str: - ''' - Convert Series to instantiatable string representation. - - Parameters - ---------- - n - Only use first n elements. - - See Also - -------- - polars.Series.to_init_repr - polars.from_repr - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) - >>> print(s.to_init_repr()) - pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) - >>> s_from_str_repr = eval(s.to_init_repr()) - >>> s_from_str_repr - shape: (4,) - Series: \'a\' [i16] - [ - 1 - 2 - null - 4 - ] - - ''' - def set(self, filter: Series, value: int | float | str | bool | None) -> Series: - ''' - Set masked values. - - Parameters - ---------- - filter - Boolean mask. - value - Value with which to replace the masked values. - - Notes - ----- - Use of this function is frequently an anti-pattern, as it can - block optimisation (predicate pushdown, etc). Consider using - `pl.when(predicate).then(value).otherwise(self)` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.set(s == 2, 10) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 10 - 3 - ] - - It is better to implement this as follows: - - >>> s.to_frame().select( - ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) - ... ) - shape: (3, 1) - ┌─────────┐ - │ literal │ - │ --- │ - │ i64 │ - ╞═════════╡ - │ 1 │ - │ 10 │ - │ 3 │ - └─────────┘ - - ''' - def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: - ''' - Set values at the index locations. - - Parameters - ---------- - indices - Integers representing the index locations. - values - Replacement values. - - Notes - ----- - Use of this function is frequently an anti-pattern, as it can - block optimization (predicate pushdown, etc). Consider using - `pl.when(predicate).then(value).otherwise(self)` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.scatter(1, 10) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 10 - 3 - ] - - It is better to implement this as follows: - - >>> s.to_frame().with_row_count("row_nr").select( - ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) - ... ) - shape: (3, 1) - ┌─────────┐ - │ literal │ - │ --- │ - │ i64 │ - ╞═════════╡ - │ 1 │ - │ 10 │ - │ 3 │ - └─────────┘ - - ''' - def clear(self, n: int = ...) -> Series: - ''' - Create an empty copy of the current Series, with zero to \'n\' elements. - - The copy has an identical name/dtype, but no data. - - Parameters - ---------- - n - Number of (empty) elements to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. 
- - Examples - -------- - >>> s = pl.Series("a", [None, True, False]) - >>> s.clear() - shape: (0,) - Series: \'a\' [bool] - [ - ] - - >>> s.clear(n=2) - shape: (2,) - Series: \'a\' [bool] - [ - null - null - ] - - ''' - def clone(self) -> Self: - ''' - Create a copy of this Series. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current Series, with identical - schema but no data. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.clone() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Series: - ''' - Fill floating point NaN value with a fill value. - - Parameters - ---------- - value - Value used to fill NaN values. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, float("nan")]) - >>> s.fill_nan(0) - shape: (4,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - 0.0 - ] - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, None]) - >>> s.fill_null(strategy="forward") - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 3 - ] - >>> s.fill_null(strategy="min") - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 1 - ] - >>> s = pl.Series("b", ["x", None, "z"]) - >>> s.fill_null(pl.lit("")) - shape: (3,) - Series: \'b\' [str] - [ - "x" - "" - "z" - ] - - ''' - def floor(self) -> Series: - ''' - Rounds down to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.floor() - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - ] - - ''' - def ceil(self) -> Series: - ''' - Rounds up to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.ceil() - shape: (3,) - Series: \'a\' [f64] - [ - 2.0 - 3.0 - 4.0 - ] - - ''' - def round(self, decimals: int = ...) -> Series: - ''' - Round underlying floating point data by `decimals` digits. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.round(2) - shape: (3,) - Series: \'a\' [f64] - [ - 1.12 - 2.57 - 3.9 - ] - - Parameters - ---------- - decimals - number of decimals to round by. - - ''' - def round_sig_figs(self, digits: int) -> Series: - """ - Round to a number of significant figures. - - Parameters - ---------- - digits - Number of significant figures to round to. - - Examples - -------- - >>> s = pl.Series([0.01234, 3.333, 1234.0]) - >>> s.round_sig_figs(2) - shape: (3,) - Series: '' [f64] - [ - 0.012 - 3.3 - 1200.0 - ] - - """ - def dot(self, other: Series | ArrayLike) -> float | None: - ''' - Compute the dot/inner product between two Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) - >>> s.dot(s2) - 32.0 - - Parameters - ---------- - other - Series (or array) to compute dot product with. - - ''' - def mode(self) -> Series: - ''' - Compute the most occurring value(s). 
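The filling helpers above, plus the `when`/`then`/`otherwise` pattern that the `set` and `scatter` docstrings recommend over in-place masking:

import polars as pl

s = pl.Series("a", [1, 2, 3, None])
print(s.fill_null(strategy="forward"))
print(pl.Series("a", [1.0, float("nan")]).fill_nan(0))

out = (
    pl.Series("a", [1, 2, 3])
    .to_frame()
    .select(pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")))
)
print(out)  # 1, 10, 3 without blocking predicate pushdown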
- - Can return multiple Values. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.mode() - shape: (1,) - Series: \'a\' [i64] - [ - 2 - ] - - ''' - def sign(self) -> Series: - ''' - Compute the element-wise indication of the sign. - - The returned values can be -1, 0, or 1: - - * -1 if x < 0. - * 0 if x == 0. - * 1 if x > 0. - - (null values are preserved as-is). - - Examples - -------- - >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) - >>> s.sign() - shape: (5,) - Series: \'a\' [i64] - [ - -1 - 0 - 0 - 1 - null - ] - - ''' - def sin(self) -> Series: - ''' - Compute the element-wise value for the sine. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.sin() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.0 - 1.2246e-16 - ] - - ''' - def cos(self) -> Series: - ''' - Compute the element-wise value for the cosine. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.cos() - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 6.1232e-17 - -1.0 - ] - - ''' - def tan(self) -> Series: - ''' - Compute the element-wise value for the tangent. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.tan() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.6331e16 - -1.2246e-16 - ] - - ''' - def cot(self) -> Series: - ''' - Compute the element-wise value for the cotangent. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.cot() - shape: (3,) - Series: \'a\' [f64] - [ - inf - 6.1232e-17 - -8.1656e15 - ] - - ''' - def arcsin(self) -> Series: - ''' - Compute the element-wise value for the inverse sine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arcsin() - shape: (3,) - Series: \'a\' [f64] - [ - 1.570796 - 0.0 - -1.570796 - ] - - ''' - def arccos(self) -> Series: - ''' - Compute the element-wise value for the inverse cosine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arccos() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.570796 - 3.141593 - ] - - ''' - def arctan(self) -> Series: - ''' - Compute the element-wise value for the inverse tangent. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arctan() - shape: (3,) - Series: \'a\' [f64] - [ - 0.785398 - 0.0 - -0.785398 - ] - - ''' - def arcsinh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic sine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arcsinh() - shape: (3,) - Series: \'a\' [f64] - [ - 0.881374 - 0.0 - -0.881374 - ] - - ''' - def arccosh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic cosine. - - Examples - -------- - >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) - >>> s.arccosh() - shape: (4,) - Series: \'a\' [f64] - [ - 2.292432 - 0.0 - NaN - NaN - ] - - ''' - def arctanh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic tangent. - - Examples - -------- - >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) - >>> s.arctanh() - shape: (7,) - Series: \'a\' [f64] - [ - NaN - inf - 0.549306 - 0.0 - -0.549306 - -inf - NaN - ] - - ''' - def sinh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic sine. 
- - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.sinh() - shape: (3,) - Series: \'a\' [f64] - [ - 1.175201 - 0.0 - -1.175201 - ] - - ''' - def cosh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic cosine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.cosh() - shape: (3,) - Series: \'a\' [f64] - [ - 1.543081 - 1.0 - 1.543081 - ] - - ''' - def tanh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic tangent. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.tanh() - shape: (3,) - Series: \'a\' [f64] - [ - 0.761594 - 0.0 - -0.761594 - ] - - ''' - def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Map a custom/user-defined function (UDF) over elements in this Series. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - If the function returns a different datatype, the return_dtype arg should - be set, otherwise the method will fail. - - Implementing logic using a Python function is almost always *significantly* - slower and more memory intensive than implementing the same logic using - the native expression API because: - - - The native expression engine runs in Rust; UDFs run in Python. - - Use of Python UDFs forces the DataFrame to be materialized in memory. - - Polars-native expressions can be parallelised (UDFs typically cannot). - - Polars-native expressions can be logically optimised (UDFs cannot). - - Wherever possible you should strongly prefer the native expression API - to achieve the best performance. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output datatype. If none is given, the same datatype as this Series will be - used. - skip_nulls - Nulls will be skipped and not passed to the python function. - This is faster because python can be skipped and because we call - more specialized functions. - - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - Notes - ----- - If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP - shape: (3,) - Series: \'a\' [i64] - [ - 11 - 12 - 13 - ] - - Returns - ------- - Series - - ''' - def shift(self, n: int = ...) -> Series: - """ - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> s = pl.Series([1, 2, 3, 4]) - >>> s.shift() - shape: (4,) - Series: '' [i64] - [ - null - 1 - 2 - 3 - ] - - Pass a negative value to shift in the opposite direction instead. 
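To illustrate the UDF warning and the `shift` semantics above, a small sketch; `return_dtype=pl.Int64` is passed explicitly because the docstring flags unspecified return dtypes as a source of surprises:

import polars as pl

s = pl.Series("a", [1, 2, 3])
print(s.map_elements(lambda x: x + 10, return_dtype=pl.Int64))  # Python UDF
print(s + 10)  # equivalent native expression, much faster

t = pl.Series([1, 2, 3, 4])
print(t.shift())                    # forward by one, leading null
print(t.shift(-2, fill_value=100))  # backward by two, nulls filled with 100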
- - >>> s.shift(-2) - shape: (4,) - Series: '' [i64] - [ - 3 - 4 - null - null - ] - - Specify `fill_value` to fill the resulting null values. - - >>> s.shift(-2, fill_value=100) - shape: (4,) - Series: '' [i64] - [ - 3 - 4 - 100 - 100 - ] - - """ - def zip_with(self, mask: Series, other: Series) -> Self: - """ - Take values from self or other based on the given mask. - - Where mask evaluates true, take values from self. Where mask evaluates false, - take values from other. - - Parameters - ---------- - mask - Boolean Series. - other - Series of same type. - - Returns - ------- - Series - - Examples - -------- - >>> s1 = pl.Series([1, 2, 3, 4, 5]) - >>> s2 = pl.Series([5, 4, 3, 2, 1]) - >>> s1.zip_with(s1 < s2, s2) - shape: (5,) - Series: '' [i64] - [ - 1 - 2 - 3 - 2 - 1 - ] - >>> mask = pl.Series([True, False, True, False, True]) - >>> s1.zip_with(mask, s2) - shape: (5,) - Series: '' [i64] - [ - 1 - 4 - 3 - 2 - 5 - ] - - """ - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling min (moving min) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their min. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_min(window_size=3) - shape: (5,) - Series: \'a\' [i64] - [ - null - null - 100 - 200 - 300 - ] - - ''' - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling max (moving max) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their max. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_max(window_size=2) - shape: (5,) - Series: \'a\' [i64] - [ - null - 200 - 300 - 400 - 500 - ] - - ''' - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling mean (moving mean) over the values in this array. - - A window of length `window_size` will traverse the array. 
The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their mean. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_mean(window_size=2) - shape: (5,) - Series: \'a\' [f64] - [ - null - 150.0 - 250.0 - 350.0 - 450.0 - ] - - ''' - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling sum (moving sum) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their sum. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length of the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.rolling_sum(window_size=2) - shape: (5,) - Series: \'a\' [i64] - [ - null - 3 - 5 - 7 - 9 - ] - - ''' - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling std dev. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their std dev. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_std(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 1.0 - 1.527525 - 2.0 - ] - - ''' - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling variance. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. 
The resulting values will be aggregated to their variance. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_var(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 1.0 - 2.333333 - 4.0 - ] - - ''' - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a custom rolling window function. - - .. warning:: - Computing custom functions is extremely slow. Use specialized rolling - functions such as :func:`Series.rolling_sum` if at all possible. - - Parameters - ---------- - function - Custom aggregation function. - window_size - Size of the window. The window at a given row will include the row - itself and the `window_size - 1` elements before it. - weights - A list of weights with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window. - - Warnings - -------- - - - Examples - -------- - >>> from numpy import nansum - >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) - >>> s.rolling_map(nansum, window_size=3) - shape: (5,) - Series: \'\' [f64] - [ - null - null - 22.0 - 11.0 - 17.0 - ] - - ''' - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling median. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_median(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 2.0 - 3.0 - 4.0 - 6.0 - ] - - ''' - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling quantile. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - window_size - The length of the window. 
- weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_quantile(quantile=0.33, window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 2.0 - 3.0 - 4.0 - ] - >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.66 - 2.66 - 3.66 - 5.32 - ] - - ''' - def rolling_skew(self, window_size: int) -> Series: - """ - Compute a rolling skew. - - The window at a given row includes the row itself and the - `window_size - 1` elements before it. - - Parameters - ---------- - window_size - Integer size of the rolling window. - bias - If False, the calculations are corrected for statistical bias. - - Examples - -------- - >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) - shape: (4,) - Series: '' [f64] - [ - null - null - 0.381802 - 0.47033 - ] - - Note how the values match - - >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() - (0.38180177416060584, 0.47033046033698594) - - """ - def sample(self, n: int | None = ...) -> Series: - ''' - Sample from this Series. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - Shuffle the order of sampled data points. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 5 - ] - - ''' - def peak_max(self) -> Self: - ''' - Get a boolean mask of the local maximum peaks. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.peak_max() - shape: (5,) - Series: \'a\' [bool] - [ - false - false - false - false - true - ] - - ''' - def peak_min(self) -> Self: - ''' - Get a boolean mask of the local minimum peaks. - - Examples - -------- - >>> s = pl.Series("a", [4, 1, 3, 2, 5]) - >>> s.peak_min() - shape: (5,) - Series: \'a\' [bool] - [ - false - true - false - true - false - ] - - ''' - def n_unique(self) -> int: - ''' - Count the number of unique values in this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.n_unique() - 3 - - ''' - def shrink_to_fit(self) -> Series: - """ - Shrink Series memory usage. - - Shrinks the underlying array capacity to exactly fit the actual data. - (Note that this function does not change the Series data type). - - """ - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: - ''' - Hash the Series. - - The hash value is of type `UInt64`. - - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. 
- - Notes - ----- - This implementation of :func:`hash` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.hash(seed=42) # doctest: +IGNORE_RESULT - shape: (3,) - Series: \'a\' [u64] - [ - 10734580197236529959 - 3022416320763508302 - 13756996518000038261 - ] - - ''' - def reinterpret(self) -> Series: - """ - Reinterpret the underlying bits as a signed/unsigned integer. - - This operation is only allowed for 64bit integers. For lower bits integers, - you can safely use that cast operation. - - Parameters - ---------- - signed - If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. - - """ - def interpolate(self, method: InterpolationMethod = ...) -> Series: - ''' - Fill null values using interpolation. - - Parameters - ---------- - method : {\'linear\', \'nearest\'} - Interpolation method. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None, None, 5]) - >>> s.interpolate() - shape: (5,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - 4.0 - 5.0 - ] - - ''' - def abs(self) -> Series: - """ - Compute absolute values. - - Same as `abs(series)`. - """ - def rank(self, method: RankMethod = ...) -> Series: - ''' - Assign ranks to data, dealing with ties appropriately. - - Parameters - ---------- - method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} - The method used to assign ranks to tied elements. - The following methods are available (default is \'average\'): - - - \'average\' : The average of the ranks that would have been assigned to - all the tied values is assigned to each value. - - \'min\' : The minimum of the ranks that would have been assigned to all - the tied values is assigned to each value. (This is also referred to - as "competition" ranking.) - - \'max\' : The maximum of the ranks that would have been assigned to all - the tied values is assigned to each value. - - \'dense\' : Like \'min\', but the rank of the next highest element is - assigned the rank immediately after those assigned to the tied - elements. - - \'ordinal\' : All values are given a distinct rank, corresponding to - the order that the values occur in the Series. - - \'random\' : Like \'ordinal\', but the rank for ties is not dependent - on the order that the values occur in the Series. - descending - Rank in descending order. - seed - If `method="random"`, use this as seed. - - Examples - -------- - The \'average\' method: - - >>> s = pl.Series("a", [3, 6, 1, 1, 6]) - >>> s.rank() - shape: (5,) - Series: \'a\' [f64] - [ - 3.0 - 4.5 - 1.5 - 1.5 - 4.5 - ] - - The \'ordinal\' method: - - >>> s = pl.Series("a", [3, 6, 1, 1, 6]) - >>> s.rank("ordinal") - shape: (5,) - Series: \'a\' [u32] - [ - 3 - 4 - 1 - 2 - 5 - ] - - ''' - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: - ''' - Calculate the first discrete difference between shifted items. - - Parameters - ---------- - n - Number of slots to shift. - null_behavior : {\'ignore\', \'drop\'} - How to handle null values. - - Examples - -------- - >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) - >>> s.diff() - shape: (5,) - Series: \'s\' [i8] - [ - null - -10 - 20 - -5 - 10 - ] - - >>> s.diff(n=2) - shape: (5,) - Series: \'s\' [i8] - [ - null - null - 10 - 15 - 5 - ] - - >>> s.diff(n=2, null_behavior="drop") - shape: (3,) - Series: \'s\' [i8] - [ - 10 - 15 - 5 - ] - - ''' - def pct_change(self, n: int | IntoExprColumn = ...) 
-> Series: - """ - Computes percentage change between values. - - Percentage change (as fraction) between current element and most-recent - non-null element at least `n` period(s) before the current element. - - Computes the change from the previous row by default. - - Parameters - ---------- - n - periods to shift for forming percent change. - - Examples - -------- - >>> pl.Series(range(10)).pct_change() - shape: (10,) - Series: '' [f64] - [ - null - inf - 1.0 - 0.5 - 0.333333 - 0.25 - 0.2 - 0.166667 - 0.142857 - 0.125 - ] - - >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) - shape: (10,) - Series: '' [f64] - [ - null - null - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - ] - - """ - def skew(self) -> float | None: - """ - Compute the sample skewness of a data set. - - For normally distributed data, the skewness should be about zero. For - unimodal continuous distributions, a skewness value greater than zero means - that there is more weight in the right tail of the distribution. The - function `skewtest` can be used to determine if the skewness value - is close enough to zero, statistically speaking. - - - See scipy.stats for more information. - - Parameters - ---------- - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Notes - ----- - The sample skewness is computed as the Fisher-Pearson coefficient - of skewness, i.e. - - .. math:: g_1=\\frac{m_3}{m_2^{3/2}} - - where - - .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i - - is the biased sample :math:`i\\texttt{th}` central moment, and - :math:`\\bar{x}` is - the sample mean. If `bias` is False, the calculations are - corrected for bias and the value computed is the adjusted - Fisher-Pearson standardized moment coefficient, i.e. - - .. math:: - G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} - - """ - def kurtosis(self) -> float | None: - """ - Compute the kurtosis (Fisher or Pearson) of a dataset. - - Kurtosis is the fourth central moment divided by the square of the - variance. If Fisher's definition is used, then 3.0 is subtracted from - the result to give 0.0 for a normal distribution. - If bias is False then the kurtosis is calculated using k statistics to - eliminate bias coming from biased moment estimators - - See scipy.stats for more information - - Parameters - ---------- - fisher : bool, optional - If True, Fisher's definition is used (normal ==> 0.0). If False, - Pearson's definition is used (normal ==> 3.0). - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - """ - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: - """ - Set values outside the given boundaries to the boundary value. - - Parameters - ---------- - lower_bound - Lower bound. Accepts expression input. - Non-expression inputs are parsed as literals. - If set to `None` (default), no lower bound is applied. - upper_bound - Upper bound. Accepts expression input. - Non-expression inputs are parsed as literals. - If set to `None` (default), no upper bound is applied. - - See Also - -------- - when - - Notes - ----- - This method only works for numeric and temporal columns. To clip other data - types, consider writing a `when-then-otherwise` expression. See :func:`when`. 
- - Examples - -------- - Specifying both a lower and upper bound: - - >>> s = pl.Series([-50, 5, 50, None]) - >>> s.clip(1, 10) - shape: (4,) - Series: '' [i64] - [ - 1 - 5 - 10 - null - ] - - Specifying only a single bound: - - >>> s.clip(upper_bound=10) - shape: (4,) - Series: '' [i64] - [ - -50 - 5 - 10 - null - ] - - """ - def lower_bound(self) -> Self: - ''' - Return the lower bound of this Series\' dtype as a unit Series. - - See Also - -------- - upper_bound : return the upper bound of the given Series\' dtype. - - Examples - -------- - >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) - >>> s.lower_bound() - shape: (1,) - Series: \'s\' [i32] - [ - -2147483648 - ] - - >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) - >>> s.lower_bound() - shape: (1,) - Series: \'s\' [f32] - [ - -inf - ] - - ''' - def upper_bound(self) -> Self: - ''' - Return the upper bound of this Series\' dtype as a unit Series. - - See Also - -------- - lower_bound : return the lower bound of the given Series\' dtype. - - Examples - -------- - >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) - >>> s.upper_bound() - shape: (1,) - Series: \'s\' [i8] - [ - 127 - ] - - >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) - >>> s.upper_bound() - shape: (1,) - Series: \'s\' [f64] - [ - inf - ] - - ''' - def replace(self, mapping: dict[Any, Any]) -> Self: - ''' - Replace values according to the given mapping. - - Needs a global string cache for lazily evaluated queries on columns of - type `Categorical`. - - Parameters - ---------- - mapping - Mapping of values to their replacement. - default - Value to use when the mapping does not contain the lookup value. - Defaults to keeping the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - - See Also - -------- - str.replace - - Examples - -------- - Replace a single value by another value. Values not in the mapping remain - unchanged. - - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.replace({2: 100}) - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 100 - 100 - 3 - ] - - Replace multiple values. Specify a default to set values not in the given map - to the default value. - - >>> s = pl.Series("country_code", ["FR", "ES", "DE", None]) - >>> country_code_map = { - ... "CA": "Canada", - ... "DE": "Germany", - ... "FR": "France", - ... None: "unspecified", - ... } - >>> s.replace(country_code_map, default=None) - shape: (4,) - Series: \'country_code\' [str] - [ - "France" - null - "Germany" - "unspecified" - ] - - The return type can be overridden with the `return_dtype` argument. - - >>> s = pl.Series("a", [0, 1, 2, 3]) - >>> s.replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) - shape: (4,) - Series: \'a\' [u8] - [ - 0 - 10 - 20 - 0 - ] - ''' - def reshape(self, dimensions: tuple[int, ...]) -> Series: - ''' - Reshape this Series to a flat Series or a Series of Lists. - - Parameters - ---------- - dimensions - Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that - dimension is inferred. - - Returns - ------- - Series - If a single dimension is given, results in a Series of the original - data type. - If a multiple dimensions are given, results in a Series of data type - :class:`List` with shape (rows, cols). - - See Also - -------- - Series.list.explode : Explode a list column. 
- - Examples - -------- - >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) - >>> s.reshape((3, 3)) - shape: (3,) - Series: \'foo\' [list[i64]] - [ - [1, 2, 3] - [4, 5, 6] - [7, 8, 9] - ] - - ''' - def shuffle(self, seed: int | None = ...) -> Series: - ''' - Shuffle the contents of this Series. - - Parameters - ---------- - seed - Seed for the random number generator. If set to None (default), a - random seed is generated each time the shuffle is called. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.shuffle(seed=1) - shape: (3,) - Series: \'a\' [i64] - [ - 2 - 1 - 3 - ] - - ''' - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - """ - Exponentially-weighted moving average. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series([1, 2, 3]) - >>> s.ewm_mean(com=1) - shape: (3,) - Series: '' [f64] - [ - 1.0 - 1.666667 - 2.428571 - ] - - """ - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - ''' - Exponentially-weighted moving standard deviation. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. 
math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.ewm_std(com=1) - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 0.707107 - 0.963624 - ] - - ''' - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - ''' - Exponentially-weighted moving variance. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. 
For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.ewm_var(com=1) - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 0.5 - 0.928571 - ] - - ''' - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: - """ - Extremely fast method for extending the Series with 'n' copies of a value. - - Parameters - ---------- - value - A constant literal value (not an expression) with which to extend - the Series; can pass None to extend with nulls. - n - The number of additional values that will be added. - - Examples - -------- - >>> s = pl.Series([1, 2, 3]) - >>> s.extend_constant(99, n=2) - shape: (5,) - Series: '' [i64] - [ - 1 - 2 - 3 - 99 - 99 - ] - - """ - def set_sorted(self) -> Self: - ''' - Flags the Series as \'sorted\'. - - Enables downstream code to user fast paths for sorted arrays. - - Parameters - ---------- - descending - If the `Series` order is descending. - - Warnings - -------- - This can lead to incorrect results if this `Series` is not sorted!! - Use with care! - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.set_sorted().max() - 3 - - ''' - def new_from_index(self, index: int, length: int) -> Self: - """Create a new Series filled with values from the given index.""" - def shrink_dtype(self) -> Series: - """ - Shrink numeric columns to the minimal required datatype. - - Shrink to the dtype needed to fit the extrema of this [`Series`]. - This can be used to reduce memory pressure. - """ - def get_chunks(self) -> list[Series]: - """Get the chunks of this Series as a list of Series.""" - def implode(self) -> Self: - """Aggregate values into a list.""" - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom/user-defined function (UDF) over elements in this Series. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Series.map_elements`. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output datatype. If none is given, the same datatype as this Series will be - used. - skip_nulls - Nulls will be skipped and not passed to the python function. - This is faster because python can be skipped and because we call - more specialized functions. - - """ - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - """ - Apply a custom rolling window function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Series.rolling_map`. - - Parameters - ---------- - function - Aggregation function - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - """ - def is_first(self) -> Series: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Series.is_first_distinct`. - - Returns - ------- - Series - Series of data type :class:`Boolean`. 
- - """ - def is_last(self) -> Series: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Series.is_last_distinct`. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - """ - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: - """ - Clip (limit) the values in an array to a `min` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - lower_bound - Lower bound. - - """ - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: - """ - Clip (limit) the values in an array to a `max` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - upper_bound - Upper bound. - - """ - def shift_and_fill(self, fill_value: int | Expr) -> Series: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - Fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def is_float(self) -> bool: - ''' - Check if this Series has floating point numbers. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_float()` instead. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0]) - >>> s.is_float() # doctest: +SKIP - True - - ''' - def is_integer(self, signed: bool | None = ...) -> bool: - ''' - Check if this Series datatype is an integer (signed or unsigned). - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_integer()` instead. - For signed/unsigned variants, use `Series.dtype.is_signed_integer()` - or `Series.dtype.is_unsigned_integer()`. - - Parameters - ---------- - signed - * if `None`, both signed and unsigned integer dtypes will match. - * if `True`, only signed integer dtypes will be considered a match. - * if `False`, only unsigned integer dtypes will be considered a match. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) - >>> s.is_integer() # doctest: +SKIP - True - >>> s.is_integer(signed=False) # doctest: +SKIP - True - >>> s.is_integer(signed=True) # doctest: +SKIP - False - - ''' - def is_numeric(self) -> bool: - ''' - Check if this Series datatype is numeric. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_float()` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.is_numeric() # doctest: +SKIP - True - - ''' - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: - """ - Check if this Series datatype is temporal. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_temporal()` instead. - - Parameters - ---------- - excluding - Optionally exclude one or more temporal dtypes from matching. - - Examples - -------- - >>> from datetime import date - >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) - >>> s.is_temporal() # doctest: +SKIP - True - >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP - False - - """ - def is_boolean(self) -> bool: - ''' - Check if this Series is a Boolean. - - .. deprecated:: 0.19.14 - Use `Series.dtype == pl.Boolean` instead. - - Examples - -------- - >>> s = pl.Series("a", [True, False, True]) - >>> s.is_boolean() # doctest: +SKIP - True - - ''' - def is_utf8(self) -> bool: - ''' - Check if this Series datatype is a Utf8. - - .. deprecated:: 0.19.14 - Use `Series.dtype == pl.Utf8` instead. 
- - Examples - -------- - >>> s = pl.Series("x", ["a", "b", "c"]) - >>> s.is_utf8() # doctest: +SKIP - True - - ''' - def take_every(self, n: int) -> Series: - """ - Take every nth value in the Series and return as new Series. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: - """ - Take values by index. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather`. - - Parameters - ---------- - indices - Index location used for selection. - """ - def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: - """ - Set values at the index locations. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`scatter`. - - Parameters - ---------- - indices - Integers representing the index locations. - values - Replacement values. - """ - def cumsum(self) -> Series: - """ - Get an array with the cumulative sum computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_sum`. - - Parameters - ---------- - reverse - reverse the operation. - - """ - def cummax(self) -> Series: - """ - Get an array with the cumulative max computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_max`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def cummin(self) -> Series: - """ - Get an array with the cumulative min computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_min`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def cumprod(self) -> Series: - """ - Get an array with the cumulative product computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_prod`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def view(self) -> SeriesView: - """ - Get a view into this Series data with a numpy array. - - .. deprecated:: 0.19.14 - This method will be removed in a future version. - - This operation doesn't clone data, but does not include missing values. - Don't use this unless you know what you are doing. - - Parameters - ---------- - ignore_nulls - If True then nulls are converted to 0. - If False then an Exception is raised if nulls are present. - - """ - def map_dict(self, mapping: dict[Any, Any]) -> Self: - """ - Replace values in the Series using a remapping dictionary. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`replace`. The default behavior - has changed to keep any values not present in the mapping unchanged. - Pass `default=None` to keep existing behavior. - - Parameters - ---------- - mapping - Dictionary containing the before/after values to map. - default - Value to use when the remapping dict does not contain the lookup value. - Use `pl.first()`, to keep the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - """ - def series_equal(self, other: Series) -> bool: - """ - Check whether the Series is equal to another Series. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`equals`. - - Parameters - ---------- - other - Series to compare with. 
- null_equal - Consider null values as equal. - strict - Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a - `pl.Int64` will return `False`. - """ - @property - def dtype(self): ... - @property - def flags(self): ... - @property - def inner_dtype(self): ... - @property - def name(self): ... - @property - def shape(self): ... - @property - def bin(self): ... - @property - def cat(self): ... - @property - def dt(self): ... - @property - def list(self): ... - @property - def arr(self): ... - @property - def str(self): ... - @property - def struct(self): ... -def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: - """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/series/series.pyi new file mode 100644 index 0000000..8df1e63 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.0/polars/series/series.pyi @@ -0,0 +1,5035 @@ +#: version 0.20.0 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Enum as Enum, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Null as Null, Object as Object, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import ModuleUpgradeRequired as ModuleUpgradeRequired, ShapeError as ShapeError +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta +from polars.utils.deprecation import 
deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, _warn_null_comparison as _warn_null_comparison, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the `Series` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series <= other`.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series < other`.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series == other`.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series == other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series != other`.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series != other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series >= other`.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series > other`.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. 
+ Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the Series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to `s[0]`, with a check + that the shape is (1,). With an index, this is equivalent to `s[index]`. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cum_sum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. 
_Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a Series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + Series has a numeric dtype). All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> s = pl.Series([1, 2, 3, 4, 5]) + >>> s.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + Non-numeric data types may not have all statistics available. 
+ + >>> s = pl.Series(["a", "a", None, "b", "c"]) + >>> s.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 4 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. 
+ + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. 
This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. + + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. 
+ If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + include_breakpoint + Include a column that indicates the upper breakpoint. + include_category + Include a column that shows the intervals as categories. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬───────┐ + │ break_point ┆ category ┆ count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═══════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴───────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬───────┐ + │ color ┆ count │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴───────┘ + + Sort the output by count. + + >>> s.value_counts(sort=True) + shape: (3, 2) + ┌───────┬───────┐ + │ color ┆ count │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴───────┘ + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. 
+ + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cum_max(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cum_max() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cum_min(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cum_min() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cum_prod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_prod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cum_sum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_sum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + The resulting series will consist of multiple chunks. + + Parameters + ---------- + other + Series to append. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. + + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from `append`, which adds the chunks from `other` to the chunks of + this series, `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer `append` over `extend` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single `Series`. In the latter case, finish the sequence + of `append` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first `abs(n)`. 
+ + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + head + + """ + def gather_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. 
+ side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no `null` values. + + Notes + ----- + While the *absence* of a validity bitmask guarantees that a Series does not + have `null` values, the converse is not true, eg: the *presence* of a + bitmask does not mean that there are null values, as every value of the + bitmask could be `false`. + + To confirm that a column has `null` values use :func:`null_count`. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def not_(self) -> Series: + ''' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def equals(self, other: Series) -> bool: + ''' + Check whether the Series is equal to another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + See Also + -------- + assert_series_equal + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s1.equals(s1) + True + >>> s1.equals(s2) + False + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. 
+ closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point `nan` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set `zero_copy_only=True`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def _view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + + Returns + ------- + SeriesView + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s._view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def count(self) -> int: + ''' + Return the number of non-null elements in the column. + + See Also + -------- + len + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.count() + 2 + ''' + def len(self) -> int: + ''' + Return the number of elements in the Series. + + Null values count towards the total. + + See Also + -------- + count + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + ''' + def set(self, filter: Series, value: int | float | str | bool | None) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimization (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.scatter(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. 
+ + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def round_sig_figs(self, digits: int) -> Series: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s.round_sig_figs(2) + shape: (3,) + Series: '' [f64] + [ + 0.012 + 3.3 + 1200.0 + ] + + """ + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. 
+ + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. 
+ + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. + + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify `fill_value` to fill the resulting null values. + + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their std dev. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. 
+ + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). 
If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no lower bound is applied. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no upper bound is applied. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> s = pl.Series([-50, 5, 50, None]) + >>> s.clip(1, 10) + shape: (4,) + Series: '' [i64] + [ + 1 + 5 + 10 + null + ] + + Specifying only a single bound: + + >>> s.clip(upper_bound=10) + shape: (4,) + Series: '' [i64] + [ + -50 + 5 + 10 + null + ] + + """ + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def replace(self, old: IntoExpr | Sequence[Any] | Mapping[Any, Any], new: IntoExpr | Sequence[Any] | NoDefault = ...) -> Self: + ''' + Replace values by different values. + + Parameters + ---------- + old + Value or sequence of values to replace. + Also accepts a mapping of values to their replacement as syntactic sugar for + `replace(new=Series(mapping.keys()), old=Series(mapping.values()))`. + new + Value or sequence of values to replace by. + Length must match the length of `old` or have length 1. + default + Set values that were not replaced to this value. + Defaults to keeping the original value. + Accepts expression input. Non-expression inputs are parsed as literals. + return_dtype + The data type of the resulting Series. If set to `None` (default), + the data type is determined automatically based on the other inputs. + + See Also + -------- + str.replace + + Notes + ----- + The global string cache must be enabled when replacing categorical values. + + Examples + -------- + Replace a single value by another value. Values that were not replaced remain + unchanged. 
+ + >>> s = pl.Series([1, 2, 2, 3]) + >>> s.replace(2, 100) + shape: (4,) + Series: \'\' [i64] + [ + 1 + 100 + 100 + 3 + ] + + Replace multiple values by passing sequences to the `old` and `new` parameters. + + >>> s.replace([2, 3], [100, 200]) + shape: (4,) + Series: \'\' [i64] + [ + 1 + 100 + 100 + 200 + ] + + Passing a mapping with replacements is also supported as syntactic sugar. + Specify a default to set all values that were not matched. + + >>> mapping = {2: 100, 3: 200} + >>> s.replace(mapping, default=-1) + shape: (4,) + Series: \'\' [i64] + [ + -1 + 100 + 100 + 200 + ] + + + The default can be another Series. + + >>> default = pl.Series([2.5, 5.0, 7.5, 10.0]) + >>> s.replace(2, 100, default=default) + shape: (4,) + Series: \'\' [f64] + [ + 2.5 + 100.0 + 100.0 + 10.0 + ] + + Replacing by values of a different data type sets the return type based on + a combination of the `new` data type and either the original data type or the + default data type if it was set. + + >>> s = pl.Series(["x", "y", "z"]) + >>> mapping = {"x": 1, "y": 2, "z": 3} + >>> s.replace(mapping) + shape: (3,) + Series: \'\' [str] + [ + "1" + "2" + "3" + ] + >>> s.replace(mapping, default=None) + shape: (3,) + Series: \'\' [i64] + [ + 1 + 2 + 3 + ] + + Set the `return_dtype` parameter to control the resulting data type directly. + + >>> s.replace(mapping, return_dtype=pl.UInt8) + shape: (3,) + Series: \'\' [u8] + [ + 1 + 2 + 3 + ] + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). 
+ + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() # doctest: +SKIP + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_integer()` instead. + For signed/unsigned variants, use `Series.dtype.is_signed_integer()` + or `Series.dtype.is_unsigned_integer()`. + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() # doctest: +SKIP + True + >>> s.is_integer(signed=False) # doctest: +SKIP + True + >>> s.is_integer(signed=True) # doctest: +SKIP + False + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_numeric()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() # doctest: +SKIP + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_temporal()` instead. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() # doctest: +SKIP + True + >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP + False + + """ + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Boolean` instead. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() # doctest: +SKIP + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Utf8` instead. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() # doctest: +SKIP + True + + ''' + def take_every(self, n: int) -> Series: + """ + Take every nth value in the Series and return as new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + Index location used for selection. + """ + def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + """ + Set values at the index locations. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`scatter`. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. 
+ """ + def cumsum(self) -> Series: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + reverse the operation. + + """ + def cummax(self) -> Series: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummin(self) -> Series: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cumprod(self) -> Series: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def view(self) -> SeriesView: + """ + Get a view into this Series data with a numpy array. + + .. deprecated:: 0.19.14 + This method will be removed in a future version. + + This operation doesn't clone data, but does not include missing values. + Don't use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in the Series using a remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + def series_equal(self, other: Series) -> bool: + """ + Check whether the Series is equal to another Series. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`equals`. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
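The `ewm_mean`, `ewm_std` and `ewm_var` docstrings above state the adjusted weighting w_i = (1 - alpha)^i and the com parametrisation alpha = 1/(1 + com), but only show the resulting doctest output. A minimal pure-Python sketch of that adjusted mean (an editorial illustration, not part of the generated stub; it assumes only the formulas quoted in those docstrings) reproducing the `ewm_mean(com=1)` doctest values:

def _ewm_mean_adjusted(values, com):
    # center-of-mass parametrisation quoted in the docstring: alpha = 1 / (1 + com)
    alpha = 1.0 / (1.0 + com)
    out = []
    for t in range(len(values)):
        # adjusted weights w_i = (1 - alpha)**i, i = 0 for the most recent value
        weights = [(1.0 - alpha) ** i for i in range(t + 1)]
        weighted_sum = sum(w * values[t - i] for i, w in enumerate(weights))
        out.append(weighted_sum / sum(weights))
    return out

# Matches the doctest above for pl.Series([1, 2, 3]).ewm_mean(com=1):
# [1.0, 1.6666..., 2.4285...]
print(_ewm_mean_adjusted([1, 2, 3], com=1))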
+def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/dataframe/frame deleted file mode 100644 index 562effd..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/dataframe/frame +++ /dev/null @@ -1,6977 +0,0 @@ -import P -import deltalake -import np as np -import pa as pa -import pd as pd -from _io import BytesIO, TextIOWrapper - -from builtins import PyDataFrame -from pathlib import Path -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes.classes import Boolean as Boolean, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 -from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.col import col as col -from polars.functions.lit import lit as lit -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, 
handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence - -TYPE_CHECKING: bool -INTEGER_DTYPES: frozenset -N_INFER_DEFAULT: int -_PYARROW_AVAILABLE: bool -_dtype_str_repr: builtin_function_or_method - -class DataFrame: - _accessors: _ClassVar[set] = ... - columns: Incomplete - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: - """Construct Polars DataFrame from FFI PyDataFrame object.""" - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from a dictionary of sequences. - - Parameters - ---------- - data : dict of sequences - Two-dimensional data represented as a dictionary. dict must contain - Sequences. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - - """ - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from a sequence of sequences. - - Parameters - ---------- - data : Sequence of sequences - Two-dimensional data represented as a sequence of sequences. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - orient : {'col', 'row'}, default None - Whether to interpret two-dimensional data as columns or as rows. If None, - the orientation is inferred by matching the columns and data dimensions. If - this does not yield conclusive results, column orientation is used. 
- infer_schema_length - How many rows to scan to determine the column type. - - """ - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from a numpy ndarray. - - Parameters - ---------- - data : numpy ndarray - Two-dimensional data represented as a numpy ndarray. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - orient : {'col', 'row'}, default None - Whether to interpret two-dimensional data as columns or as rows. If None, - the orientation is inferred by matching the columns and data dimensions. If - this does not yield conclusive results, column orientation is used. - - """ - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from an Arrow table. - - This operation will be zero copy for the most part. Types that are not - supported by Polars may be cast to the closest supported type. - - Parameters - ---------- - data : arrow table, array, or sequence of sequences - Data representing an Arrow Table or Array. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - rechunk : bool, default True - Make sure that all data is in contiguous memory. - - """ - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a Polars DataFrame from a pandas DataFrame. - - Parameters - ---------- - data : pandas DataFrame - Two-dimensional data represented as a pandas DataFrame. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. 
The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - rechunk : bool, default True - Make sure that all data is in contiguous memory. - nan_to_null : bool, default True - If the data contains NaN values they will be converted to null/None. - include_index : bool, default False - Load any non-default pandas indexes as columns. - - """ - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: - """ - Read a CSV file into a DataFrame. - - Use `pl.read_csv` to dispatch to this method. - - See Also - -------- - polars.io.read_csv - - """ - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: - """ - Read into a DataFrame from a parquet file. - - Use `pl.read_parquet` to dispatch to this method. - - See Also - -------- - polars.io.read_parquet - - """ - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: - """ - Read into a DataFrame from Apache Avro format. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns. - n_rows - Stop reading from Apache Avro file after reading `n_rows`. - - """ - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: - ''' - Read into a DataFrame from Arrow IPC file format. - - See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. - Arrow IPC files are also known as Feather (v2) files. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns to select. Accepts a list of column indices (starting at zero) or a - list of column names. - n_rows - Stop reading from IPC file after reading `n_rows`. - row_count_name - Row count name. - row_count_offset - Row count offset. - rechunk - Make sure that all data is contiguous. - memory_map - Memory map the file - - ''' - @classmethod - def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: - ''' - Read into a DataFrame from Arrow IPC record batch stream format. - - See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns to select. Accepts a list of column indices (starting at zero) or a - list of column names. - n_rows - Stop reading from IPC stream after reading `n_rows`. - row_count_name - Row count name. - row_count_offset - Row count offset. - rechunk - Make sure that all data is contiguous. - - ''' - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: - """ - Read into a DataFrame from a JSON file. - - Use `pl.read_json` to dispatch to this method. - - See Also - -------- - polars.io.read_json - - """ - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: - """ - Read into a DataFrame from a newline delimited JSON file. 
- - Use `pl.read_ndjson` to dispatch to this method. - - See Also - -------- - polars.io.read_ndjson - - """ - def _replace(self, column: str, new_column: Series) -> Self: - """Replace a column by a new Series (in place).""" - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: - """ - Numpy __array__ interface protocol. - - Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see - https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. - """ - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: - ''' - Convert to a dataframe object implementing the dataframe interchange protocol. - - Parameters - ---------- - nan_as_null - Overwrite null values in the data with `NaN`. - - .. warning:: - This functionality has not been implemented and the parameter will be - removed in a future version. - Setting this to `True` will raise a `NotImplementedError`. - allow_copy - Allow memory to be copied to perform the conversion. If set to `False`, - causes conversions that are not zero-copy to fail. - - Notes - ----- - Details on the Python dataframe interchange protocol: - https://data-apis.org/dataframe-protocol/latest/index.html - - Examples - -------- - Convert a Polars DataFrame to a generic dataframe object and access some - properties. - - >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) - >>> dfi = df.__dataframe__() - >>> dfi.num_rows() - 2 - >>> dfi.get_column(1).dtype - (, 64, \'g\', \'=\') - - ''' - def __dataframe_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. - """ - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with another object.""" - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with another DataFrame.""" - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with a non-DataFrame object.""" - def _div(self, other: Any) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Series]: ... - def __reversed__(self) -> Iterator[Series]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... 
- def _take_with_series(self, s: Series) -> DataFrame: ... - def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: - """Get item. Does quite a lot. Read the comments.""" - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: - """ - Format output data in HTML for display in Jupyter Notebooks. - - Output rows and columns can be modified by setting the following ENVIRONMENT - variables: - - * POLARS_FMT_MAX_COLS: set the number of columns - * POLARS_FMT_MAX_ROWS: set the number of rows - - """ - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: - ''' - Return the DataFrame as a scalar, or return the element at the given row/column. - - Parameters - ---------- - row - Optional row index. - column - Optional column index or name. - - See Also - -------- - row: Get the values of a single row, either by index or by predicate. - - Notes - ----- - If row/col not provided, this is equivalent to `df[0,0]`, with a check that - the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> df.select((pl.col("a") * pl.col("b")).sum()).item() - 32 - >>> df.item(1, 1) - 5 - >>> df.item(2, "b") - 6 - - ''' - def to_arrow(self) -> pa.Table: - ''' - Collect the underlying arrow arrays in an Arrow Table. - - This operation is mostly zero copy. - - Data types that do copy: - - CategoricalType - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} - ... ) - >>> df.to_arrow() - pyarrow.Table - foo: int64 - bar: large_string - ---- - foo: [[1,2,3,4,5,6]] - bar: [["a","b","c","d","e","f"]] - - ''' - def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: - ''' - Convert DataFrame to a dictionary mapping column name to values. - - Parameters - ---------- - as_series - True -> Values are Series - False -> Values are List[Any] - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4, 5], - ... "fruits": ["banana", "banana", "apple", "apple", "banana"], - ... "B": [5, 4, 3, 2, 1], - ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], - ... "optional": [28, 300, None, 2, -30], - ... } - ... 
) - >>> df - shape: (5, 5) - ┌─────┬────────┬─────┬────────┬──────────┐ - │ A ┆ fruits ┆ B ┆ cars ┆ optional │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ - ╞═════╪════════╪═════╪════════╪══════════╡ - │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ - │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ - │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ - │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ - │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ - └─────┴────────┴─────┴────────┴──────────┘ - >>> df.to_dict(as_series=False) - {\'A\': [1, 2, 3, 4, 5], - \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], - \'B\': [5, 4, 3, 2, 1], - \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], - \'optional\': [28, 300, None, 2, -30]} - >>> df.to_dict(as_series=True) - {\'A\': shape: (5,) - Series: \'A\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ], \'fruits\': shape: (5,) - Series: \'fruits\' [str] - [ - "banana" - "banana" - "apple" - "apple" - "banana" - ], \'B\': shape: (5,) - Series: \'B\' [i64] - [ - 5 - 4 - 3 - 2 - 1 - ], \'cars\': shape: (5,) - Series: \'cars\' [str] - [ - "beetle" - "audi" - "beetle" - "beetle" - "beetle" - ], \'optional\': shape: (5,) - Series: \'optional\' [i64] - [ - 28 - 300 - null - 2 - -30 - ]} - - ''' - def to_dicts(self) -> list[dict[str, Any]]: - ''' - Convert every row to a dictionary of Python-native values. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.to_dicts() - [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] - - ''' - def to_numpy(self) -> np.ndarray[Any, Any]: - ''' - Convert DataFrame to a 2D NumPy array. - - This operation clones data. - - Parameters - ---------- - structured - Optionally return a structured array, with field names and - dtypes that correspond to the DataFrame schema. - order - The index order of the returned NumPy array, either C-like or - Fortran-like. In general, using the Fortran-like index order is faster. - However, the C-like order might be more appropriate to use for downstream - applications to prevent cloning data, e.g. when reshaping into a - one-dimensional array. Note that this option only takes effect if - `structured` is set to `False` and the DataFrame dtypes allow for a - global dtype for all columns. - - Notes - ----- - If you\'re attempting to convert Utf8 to an array you\'ll need to install - `pyarrow`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.5, 7.0, 8.5], - ... "ham": ["a", "b", "c"], - ... }, - ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, - ... ) - - Export to a standard 2D numpy array. - - >>> df.to_numpy() - array([[1, 6.5, \'a\'], - [2, 7.0, \'b\'], - [3, 8.5, \'c\']], dtype=object) - - Export to a structured array, which can better-preserve individual - column data, such as name and dtype... - - >>> df.to_numpy(structured=True) - array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], - dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np - >>> df.to_numpy(structured=True).view(np.recarray) - rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], - dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: - ''' - Cast to a pandas DataFrame. 
-
- This requires that :mod:`pandas` and :mod:`pyarrow` are installed.
- This operation clones data, unless `use_pyarrow_extension_array=True`.
-
- Parameters
- ----------
- use_pyarrow_extension_array
- Use PyArrow backed-extension arrays instead of numpy arrays for each column
- of the pandas DataFrame; this allows zero copy operations and preservation
- of null values. Subsequent operations on the resulting pandas DataFrame may
- trigger conversion to NumPy arrays if that operation is not supported by
- pyarrow compute functions.
- **kwargs
- Arguments will be sent to :meth:`pyarrow.Table.to_pandas`.
-
- Returns
- -------
- :class:`pandas.DataFrame`
-
- Examples
- --------
- >>> import pandas
- >>> df1 = pl.DataFrame(
- ... {
- ... "foo": [1, 2, 3],
- ... "bar": [6, 7, 8],
- ... "ham": ["a", "b", "c"],
- ... }
- ... )
- >>> pandas_df1 = df1.to_pandas()
- >>> type(pandas_df1)
- <class \'pandas.core.frame.DataFrame\'>
- >>> pandas_df1.dtypes
- foo int64
- bar int64
- ham object
- dtype: object
- >>> df2 = pl.DataFrame(
- ... {
- ... "foo": [1, 2, None],
- ... "bar": [6, None, 8],
- ... "ham": [None, "b", "c"],
- ... }
- ... )
- >>> pandas_df2 = df2.to_pandas()
- >>> pandas_df2
- foo bar ham
- 0 1.0 6.0 None
- 1 2.0 NaN b
- 2 NaN 8.0 c
- >>> pandas_df2.dtypes
- foo float64
- bar float64
- ham object
- dtype: object
- >>> pandas_df2_pa = df2.to_pandas(
- ... use_pyarrow_extension_array=True
- ... ) # doctest: +SKIP
- >>> pandas_df2_pa # doctest: +SKIP
- foo bar ham
- 0 1 6 <NA>
- 1 2 <NA> b
- 2 <NA> 8 c
- >>> pandas_df2_pa.dtypes # doctest: +SKIP
- foo int64[pyarrow]
- bar int64[pyarrow]
- ham large_string[pyarrow]
- dtype: object
-
- '''
- def to_series(self, index: int = ...) -> Series:
- '''
- Select column as Series at index location.
-
- Parameters
- ----------
- index
- Location of selection.
-
- See Also
- --------
- get_column
-
- Examples
- --------
- >>> df = pl.DataFrame(
- ... {
- ... "foo": [1, 2, 3],
- ... "bar": [6, 7, 8],
- ... "ham": ["a", "b", "c"],
- ... }
- ... )
- >>> df.to_series(1)
- shape: (3,)
- Series: \'bar\' [i64]
- [
- 6
- 7
- 8
- ]
-
- '''
- def to_init_repr(self, n: int = ...) -> str:
- '''
- Convert DataFrame to instantiatable string representation.
-
- Parameters
- ----------
- n
- Only use first n rows.
-
- See Also
- --------
- polars.Series.to_init_repr
- polars.from_repr
-
- Examples
- --------
- >>> df = pl.DataFrame(
- ... [
- ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8),
- ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32),
- ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical),
- ... ]
- ... )
- >>> print(df.to_init_repr())
- pl.DataFrame(
- [
- pl.Series("foo", [1, 2, 3], dtype=pl.UInt8),
- pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32),
- pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical),
- ]
- )
-
- >>> df_from_str_repr = eval(df.to_init_repr())
- >>> df_from_str_repr
- shape: (3, 3)
- ┌─────┬─────┬─────┐
- │ foo ┆ bar ┆ ham │
- │ --- ┆ --- ┆ --- │
- │ u8 ┆ f32 ┆ cat │
- ╞═════╪═════╪═════╡
- │ 1 ┆ 6.0 ┆ a │
- │ 2 ┆ 7.0 ┆ b │
- │ 3 ┆ 8.0 ┆ c │
- └─────┴─────┴─────┘
-
- '''
- def write_json(self, file: IOBase | str | Path | None = ...) -> str | None:
- '''
- Serialize to JSON representation.
-
- Parameters
- ----------
- file
- File path or writeable file-like object to which the result will be written.
- If set to `None` (default), the output is returned as a string instead.
- pretty
- Pretty serialize json.
- row_oriented
- Write to row oriented json. This is slower, but more common.
-
- See Also
- --------
- DataFrame.write_ndjson
-
- Examples
- --------
- >>> df = pl.DataFrame(
- ... {
- ... "foo": [1, 2, 3],
- ... "bar": [6, 7, 8],
- ... }
- ... )
- >>> df.write_json()
- \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\'
- >>> df.write_json(row_oriented=True)
- \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\'
-
- '''
- def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None:
- '''
- Serialize to newline delimited JSON representation.
-
- Parameters
- ----------
- file
- File path or writeable file-like object to which the result will be written.
- If set to `None` (default), the output is returned as a string instead.
-
- Examples
- --------
- >>> df = pl.DataFrame(
- ... {
- ... "foo": [1, 2, 3],
- ... "bar": [6, 7, 8],
- ... }
- ... )
- >>> df.write_ndjson()
- \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\'
-
- '''
- def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None:
- '''
- Write to comma-separated values (CSV) file.
-
- Parameters
- ----------
- file
- File path or writeable file-like object to which the result will be written.
- If set to `None` (default), the output is returned as a string instead.
- include_bom
- Whether to include UTF-8 BOM in the CSV output.
- include_header
- Whether to include header in the CSV output.
- separator
- Separate CSV fields with this symbol.
- line_terminator
- String used to end each row.
- quote_char
- Byte to use as quoting character.
- batch_size
- Number of rows that will be processed per thread.
- datetime_format
- A format string, with the specifiers defined by the
- `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
- Rust crate. If no format specified, the default fractional-second
- precision is inferred from the maximum timeunit found in the frame\'s
- Datetime cols (if any).
- date_format
- A format string, with the specifiers defined by the
- `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
- Rust crate.
- time_format
- A format string, with the specifiers defined by the
- `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
- Rust crate.
- float_precision
- Number of decimal places to write, applied to both `Float32` and
- `Float64` datatypes.
- null_value
- A string representing null values (defaulting to the empty string).
- quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'}
- Determines the quoting strategy used.
-
- - necessary (default): This puts quotes around fields only when necessary.
- They are necessary when fields contain a quote,
- separator or record terminator.
- Quotes are also necessary when writing an empty record
- (which is indistinguishable from a record with one empty field).
- This is the default.
- - always: This puts quotes around every field. Always.
- - never: This never puts quotes around fields, even if that results in
- invalid CSV data (e.g.: by not quoting strings containing the separator).
- - non_numeric: This puts quotes around all fields that are non-numeric.
- Namely, when writing a field that does not parse as a valid float
- or integer, then quotes will be used even if they aren`t strictly
- necessary.
-
- Examples
- --------
- >>> import pathlib
- >>>
- >>> df = pl.DataFrame(
- ... {
- ... "foo": [1, 2, 3, 4, 5],
- ... "bar": [6, 7, 8, 9, 10],
- ... "ham": ["a", "b", "c", "d", "e"],
- ... }
- ...
) - >>> path: pathlib.Path = dirpath / "new_file.csv" - >>> df.write_csv(path, separator=",") - - ''' - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: - ''' - Write to Apache Avro file. - - Parameters - ---------- - file - File path or writeable file-like object to which the data will be written. - compression : {\'uncompressed\', \'snappy\', \'deflate\'} - Compression method. Defaults to "uncompressed". - name - Schema name. Defaults to empty string. - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.avro" - >>> df.write_avro(path) - - ''' - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: - ''' - Write frame data to a table in an Excel workbook/worksheet. - - Parameters - ---------- - workbook : Workbook - String name or path of the workbook to create, BytesIO object to write - into, or an open `xlsxwriter.Workbook` object that has not been closed. - If None, writes to a `dataframe.xlsx` workbook in the working directory. - worksheet : str - Name of target worksheet; if None, writes to "Sheet1" when creating a new - workbook (note that writing to an existing workbook requires a valid - existing -or new- worksheet name). - position : {str, tuple} - Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. - table_style : {str, dict} - A named Excel table style, such as "Table Style Medium 4", or a dictionary - of `{"key":value,}` options containing one or more of the following keys: - "style", "first_column", "last_column", "banded_columns, "banded_rows". - table_name : str - Name of the output table object in the worksheet; can then be referred to - in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. - column_formats : dict - A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an - Excel format string to the given columns. Formats defined here (such as - "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. - dtype_formats : dict - A `{dtype:str,}` dictionary that sets the default Excel format for the - given dtype. (This can be overridden on a per-column basis by the - `column_formats` param). It is also valid to use dtype groups such as - `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform - integer and float formats. - conditional_formats : dict - A dictionary of colname (or selector) keys to a format str, dict, or list - that defines conditional formatting options for the specified columns. - - * If supplying a string typename, should be one of the valid `xlsxwriter` - types such as "3_color_scale", "data_bar", etc. - * If supplying a dictionary you can make use of any/all `xlsxwriter` - supported options, including icon sets, formulae, etc. - * Supplying multiple columns as a tuple/key will apply a single format - across all columns - this is effective in creating a heatmap, as the - min/max values will be determined across the entire range, not per-column. - * Finally, you can also supply a list made up from the above options - in order to apply *more* than one conditional format to the same range. 
- header_format : dict - A `{key:value,}` dictionary of `xlsxwriter` format options to apply - to the table header row, such as `{"bold":True, "font_color":"#702963"}`. - column_totals : {bool, list, dict} - Add a column-total row to the exported table. - - * If True, all numeric columns will have an associated total using "sum". - * If passing a string, it must be one of the valid total function names - and all numeric columns will have an associated total using that function. - * If passing a list of colnames, only those given will have a total. - * For more control, pass a `{colname:funcname,}` dict. - - Valid total function names are "average", "count_nums", "count", "max", - "min", "std_dev", "sum", and "var". - column_widths : {dict, int} - A `{colname:int,}` or `{selector:int,}` dict or a single integer that - sets (or overrides if autofitting) table column widths, in integer pixel - units. If given as an integer the same value is used for all table columns. - row_totals : {dict, bool} - Add a row-total column to the right-hand side of the exported table. - - * If True, a column called "total" will be added at the end of the table - that applies a "sum" function row-wise across all numeric columns. - * If passing a list/sequence of column names, only the matching columns - will participate in the sum. - * Can also pass a `{colname:columns,}` dictionary to create one or - more total columns with distinct names, referencing different columns. - row_heights : {dict, int} - An int or `{row_index:int,}` dictionary that sets the height of the given - rows (if providing a dictionary) or all rows (if providing an integer) that - intersect with the table body (including any header and total row) in - integer pixel units. Note that `row_index` starts at zero and will be - the header row (unless `include_header` is False). - sparklines : dict - A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more - sparklines to be written into a new column in the table. - - * If passing a list of colnames (used as the source of the sparkline data) - the default sparkline settings are used (eg: line chart with no markers). - * For more control an `xlsxwriter`-compliant options dict can be supplied, - in which case three additional polars-specific keys are available: - "columns", "insert_before", and "insert_after". These allow you to define - the source columns and position the sparkline(s) with respect to other - table columns. If no position directive is given, sparklines are added to - the end of the table (eg: to the far right) in the order they are given. - formulas : dict - A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or - more formulas to be written into a new column in the table. Note that you - are strongly advised to use structured references in your formulae wherever - possible to make it simple to reference columns by name. - - * If providing a string formula (such as "=[@colx]*[@coly]") the column will - be added to the end of the table (eg: to the far right), after any default - sparklines and before any row_totals. - * For the most control supply an options dictionary with the following keys: - "formula" (mandatory), one of "insert_before" or "insert_after", and - optionally "return_dtype". The latter is used to appropriately format the - output of the formula and allow it to participate in row/column totals. 
- float_precision : int - Default number of decimals displayed for floating point columns (note that - this is purely a formatting directive; the actual values are not rounded). - include_header : bool - Indicate if the table should be created with a header row. - autofilter : bool - If the table has headers, provide autofilter capability. - autofit : bool - Calculate individual column widths from the data. - hidden_columns : list - A list or selector representing table columns to hide in the worksheet. - hide_gridlines : bool - Do not display any gridlines on the output worksheet. - sheet_zoom : int - Set the default zoom level of the output worksheet. - freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) - Freeze workbook panes. - - * If (row, col) is supplied, panes are split at the top-left corner of the - specified cell, which are 0-indexed. Thus, to freeze only the top row, - supply (1, 0). - * Alternatively, cell notation can be used to supply the cell. For example, - "A2" indicates the split occurs at the top-left of cell A2, which is the - equivalent of (1, 0). - * If (row, col, top_row, top_col) are supplied, the panes are split based on - the `row` and `col`, and the scrolling region is inititalized to begin at - the `top_row` and `top_col`. Thus, to freeze only the top row and have the - scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). - Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. - - Notes - ----- - * A list of compatible `xlsxwriter` format property names can be found here: - https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties - - * Conditional formatting dictionaries should provide xlsxwriter-compatible - definitions; polars will take care of how they are applied on the worksheet - with respect to the relative sheet/column position. For supported options, - see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html - - * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible - key/values, as well as a mandatory polars "columns" key that defines the - sparkline source data; these source columns should all be adjacent. Two other - polars-specific keys are available to help define where the sparkline appears - in the table: "insert_after", and "insert_before". The value associated with - these keys should be the name of a column in the exported table. - https://xlsxwriter.readthedocs.io/working_with_sparklines.html - - * Formula dictionaries *must* contain a key called "formula", and then optional - "insert_after", "insert_before", and/or "return_dtype" keys. These additional - keys allow the column to be injected into the table at a specific location, - and/or to define the return type of the formula (eg: "Int64", "Float64", etc). - Formulas that refer to table columns should use Excel\'s structured references - syntax to ensure the formula is applied correctly and is table-relative. - https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e - - Examples - -------- - Instantiate a basic DataFrame: - - >>> from random import uniform - >>> from datetime import date - >>> - >>> df = pl.DataFrame( - ... { - ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], - ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], - ... "val": [10_000, 20_000, 30_000], - ... } - ... 
) - - Export to "dataframe.xlsx" (the default workbook name, if not specified) in the - working directory, add column totals ("sum" by default) on all numeric columns, - then autofit: - - >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP - - Write frame to a specific location on the sheet, set a named table style, - apply US-style date formatting, increase default float precision, apply a - non-default total function to a single column, autofit: - - >>> df.write_excel( # doctest: +SKIP - ... position="B4", - ... table_style="Table Style Light 16", - ... dtype_formats={pl.Date: "mm/dd/yyyy"}, - ... column_totals={"num": "average"}, - ... float_precision=6, - ... autofit=True, - ... ) - - Write the same frame to a named worksheet twice, applying different styles - and conditional formatting to each table, adding table titles using explicit - xlsxwriter integration: - - >>> from xlsxwriter import Workbook - >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP - ... # basic/default conditional formatting - ... df.write_excel( - ... workbook=wb, - ... worksheet="data", - ... position=(3, 1), # specify position as (row,col) coordinates - ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, - ... table_style="Table Style Medium 4", - ... ) - ... - ... # advanced conditional formatting, custom styles - ... df.write_excel( - ... workbook=wb, - ... worksheet="data", - ... position=(len(df) + 7, 1), - ... table_style={ - ... "style": "Table Style Light 4", - ... "first_column": True, - ... }, - ... conditional_formats={ - ... "num": { - ... "type": "3_color_scale", - ... "min_color": "#76933c", - ... "mid_color": "#c4d79b", - ... "max_color": "#ebf1de", - ... }, - ... "val": { - ... "type": "data_bar", - ... "data_bar_2010": True, - ... "bar_color": "#9bbb59", - ... "bar_negative_color_same": True, - ... "bar_negative_border_color_same": True, - ... }, - ... }, - ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, - ... column_widths={"val": 125}, - ... autofit=True, - ... ) - ... - ... # add some table titles (with a custom format) - ... ws = wb.get_worksheet_by_name("data") - ... fmt_title = wb.add_format( - ... { - ... "font_color": "#4f6228", - ... "font_size": 12, - ... "italic": True, - ... "bold": True, - ... } - ... ) - ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) - ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) - ... - - Export a table containing two different types of sparklines. Use default - options for the "trend" sparkline and customised options (and positioning) - for the "+/-" win_loss sparkline, with non-default integer dtype formatting, - column totals, a subtle two-tone heatmap and hidden worksheet gridlines: - - >>> df = pl.DataFrame( - ... { - ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], - ... "q1": [100, 55, -20, 0, 35], - ... "q2": [30, -10, 15, 60, 20], - ... "q3": [-50, 0, 40, 80, 80], - ... "q4": [75, 55, 25, -10, -55], - ... } - ... ) - >>> df.write_excel( # doctest: +SKIP - ... table_style="Table Style Light 2", - ... # apply accounting format to all flavours of integer - ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, - ... sparklines={ - ... # default options; just provide source cols - ... "trend": ["q1", "q2", "q3", "q4"], - ... # customised sparkline type, with positioning directive - ... "+/-": { - ... "columns": ["q1", "q2", "q3", "q4"], - ... "insert_after": "id", - ... "type": "win_loss", - ... }, - ... }, - ... conditional_formats={ - ... 
# create a unified multi-column heatmap - ... ("q1", "q2", "q3", "q4"): { - ... "type": "2_color_scale", - ... "min_color": "#95b3d7", - ... "max_color": "#ffffff", - ... }, - ... }, - ... column_totals=["q1", "q2", "q3", "q4"], - ... row_totals=True, - ... hide_gridlines=True, - ... ) - - Export a table containing an Excel formula-based column that calculates a - standardised Z-score, showing use of structured references in conjunction - with positioning directives, column totals, and custom formatting. - - >>> df = pl.DataFrame( - ... { - ... "id": ["a123", "b345", "c567", "d789", "e101"], - ... "points": [99, 45, 50, 85, 35], - ... } - ... ) - >>> df.write_excel( # doctest: +SKIP - ... table_style={ - ... "style": "Table Style Medium 15", - ... "first_column": True, - ... }, - ... column_formats={ - ... "id": {"font": "Consolas"}, - ... "points": {"align": "center"}, - ... "z-score": {"align": "center"}, - ... }, - ... column_totals="average", - ... formulas={ - ... "z-score": { - ... # use structured references to refer to the table columns and \'totals\' row - ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", - ... "insert_after": "points", - ... "return_dtype": pl.Float64, - ... } - ... }, - ... hide_gridlines=True, - ... sheet_zoom=125, - ... ) - - ''' - def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: - ''' - Write to Arrow IPC binary stream or Feather file. - - See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - file - Path or writeable file-like object to which the IPC data will be - written. If set to `None`, the output is returned as a BytesIO object. - compression : {\'uncompressed\', \'lz4\', \'zstd\'} - Compression method. Defaults to "uncompressed". - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.arrow" - >>> df.write_ipc(path) - - ''' - def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: - ''' - Write to Arrow IPC record batch stream. - - See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - file - Path or writeable file-like object to which the IPC record batch data will - be written. If set to `None`, the output is returned as a BytesIO object. - compression : {\'uncompressed\', \'lz4\', \'zstd\'} - Compression method. Defaults to "uncompressed". - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.arrow" - >>> df.write_ipc_stream(path) - - ''' - def write_parquet(self, file: str | Path | BytesIO) -> None: - ''' - Write to Apache Parquet file. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. - Choose "snappy" for more backwards compatibility guarantees - when you deal with older parquet readers. 
- compression_level - The level of compression to use. Higher compression means smaller files on - disk. - - - "gzip" : min-level: 0, max-level: 10. - - "brotli" : min-level: 0, max-level: 11. - - "zstd" : min-level: 1, max-level: 22. - - statistics - Write statistics to the parquet headers. This requires extra compute. - row_group_size - Size of the row groups in number of rows. Defaults to 512^2 rows. - use_pyarrow - Use C++ parquet implementation vs Rust parquet implementation. - At the moment C++ supports more features. - pyarrow_options - Arguments passed to `pyarrow.parquet.write_table`. - - If you pass `partition_cols` here, the dataset will be written - using `pyarrow.parquet.write_to_dataset`. - The `partition_cols` parameter leads to write the dataset to a directory. - Similar to Spark\'s partitioned datasets. - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.parquet" - >>> df.write_parquet(path) - - We can use pyarrow with use_pyarrow_write_to_dataset=True - to write partitioned datasets. The following example will - write the first row to ../watermark=1/*.parquet and the - other rows to ../watermark=2/*.parquet. - - >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) - >>> path: pathlib.Path = dirpath / "partitioned_object" - >>> df.write_parquet( - ... path, - ... use_pyarrow=True, - ... pyarrow_options={"partition_cols": ["watermark"]}, - ... ) - - ''' - def write_database(self, table_name: str, connection: str) -> None: - ''' - Write a polars frame to a database. - - Parameters - ---------- - table_name - Name of the table to create or append to in the target SQL database. - If your table name contains special characters, it should be quoted. - connection - Connection URI string, for example: - - * "postgresql://user:pass@server:port/database" - * "sqlite:////path/to/database.db" - if_exists : {\'append\', \'replace\', \'fail\'} - The insert mode. - \'replace\' will create a new database table, overwriting an existing one. - \'append\' will append to an existing table. - \'fail\' will fail if table already exists. - engine : {\'sqlalchemy\', \'adbc\'} - Select the engine used for writing the data. - ''' - def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: - ''' - Write DataFrame as delta table. - - Parameters - ---------- - target - URI of a table or a DeltaTable object. - mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} - How to handle existing data. - - * If \'error\', throw an error if the table already exists (default). - * If \'append\', will add new data. - * If \'overwrite\', will replace table with new data. - * If \'ignore\', will not write anything if table already exists. - overwrite_schema - If True, allows updating the schema of the table. - storage_options - Extra options for the storage backends supported by `deltalake`. - For cloud storages, this may include configurations for authentication etc. - - * See a list of supported storage options for S3 `here `__. - * See a list of supported storage options for GCS `here `__. - * See a list of supported storage options for Azure `here `__. - delta_write_options - Additional keyword arguments while writing a Delta lake Table. - See a list of supported write options `here `__. - - Raises - ------ - TypeError - If the DataFrame contains unsupported data types. 
- ArrowInvalidError - If the DataFrame contains data types that could not be cast to their - primitive type. - - Notes - ----- - The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` - are not supported by the delta protocol specification and will raise a - TypeError. - - Some other data types are not supported but have an associated `primitive type - `__ - to which they can be cast. This affects the following data types: - - - Unsigned integers - - :class:`Datetime` types with millisecond or nanosecond precision or with - time zone information - - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) - - Polars columns are always nullable. To write data to a delta table with - non-nullable columns, a custom pyarrow schema has to be passed to the - `delta_write_options`. See the last example below. - - Examples - -------- - Write a dataframe to the local filesystem as a Delta Lake table. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> table_path = "/path/to/delta-table/" - >>> df.write_delta(table_path) # doctest: +SKIP - - Append data to an existing Delta Lake table on the local filesystem. - Note that this will fail if the schema of the new data does not match the - schema of the existing table. - - >>> df.write_delta(table_path, mode="append") # doctest: +SKIP - - Overwrite a Delta Lake table as a new version. - If the schemas of the new and old data are the same, setting - `overwrite_schema` is not required. - - >>> existing_table_path = "/path/to/delta-table/" - >>> df.write_delta( - ... existing_table_path, mode="overwrite", overwrite_schema=True - ... ) # doctest: +SKIP - - Write a dataframe as a Delta Lake table to a cloud object store like S3. - - >>> table_path = "s3://bucket/prefix/to/delta-table/" - >>> df.write_delta( - ... table_path, - ... storage_options={ - ... "AWS_REGION": "THE_AWS_REGION", - ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", - ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", - ... }, - ... ) # doctest: +SKIP - - Write DataFrame as a Delta Lake table with non-nullable columns. - - >>> import pyarrow as pa - >>> existing_table_path = "/path/to/delta-table/" - >>> df.write_delta( - ... existing_table_path, - ... delta_write_options={ - ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) - ... }, - ... ) # doctest: +SKIP - - ''' - def estimated_size(self, unit: SizeUnit = ...) -> int | float: - ''' - Return an estimation of the total (heap) allocated size of the `DataFrame`. - - Estimated size is given in the specified unit (bytes by default). - - This estimation is the sum of the size of its buffers, validity, including - nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the - size of 2 arrays is not the sum of the sizes computed from this function. In - particular, [`StructArray`]\'s size is an upper bound. - - When an array is sliced, its allocated size remains constant because the buffer - unchanged. However, this function will yield a smaller number. This is because - this function returns the visible size of the buffer, not its total capacity. - - FFI buffers are included in this estimation. - - Parameters - ---------- - unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} - Scale the returned size to the given unit. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "x": list(reversed(range(1_000_000))), - ... 
"y": [v / 1000 for v in range(1_000_000)], - ... "z": [str(v) for v in range(1_000_000)], - ... }, - ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], - ... ) - >>> df.estimated_size() - 25888898 - >>> df.estimated_size("mb") - 24.689577102661133 - - ''' - def transpose(self) -> Self: - ''' - Transpose a DataFrame over the diagonal. - - Parameters - ---------- - include_header - If set, the column names will be added as first column. - header_name - If `include_header` is set, this determines the name of the column that will - be inserted. - column_names - Optional iterable yielding strings or a string naming an existing column. - These will name the value (non-header) columns in the transposed data. - - Notes - ----- - This is a very expensive operation. Perhaps you can do it differently. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) - >>> df.transpose(include_header=True) - shape: (2, 4) - ┌────────┬──────────┬──────────┬──────────┐ - │ column ┆ column_0 ┆ column_1 ┆ column_2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞════════╪══════════╪══════════╪══════════╡ - │ a ┆ 1 ┆ 2 ┆ 3 │ - │ b ┆ 1 ┆ 2 ┆ 3 │ - └────────┴──────────┴──────────┴──────────┘ - - Replace the auto-generated column names with a list - - >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 2 ┆ 3 │ - │ 1 ┆ 2 ┆ 3 │ - └─────┴─────┴─────┘ - - Include the header as a separate column - - >>> df.transpose( - ... include_header=True, header_name="foo", column_names=["a", "b", "c"] - ... ) - shape: (2, 4) - ┌─────┬─────┬─────┬─────┐ - │ foo ┆ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═════╡ - │ a ┆ 1 ┆ 2 ┆ 3 │ - │ b ┆ 1 ┆ 2 ┆ 3 │ - └─────┴─────┴─────┴─────┘ - - Replace the auto-generated column with column names from a generator function - - >>> def name_generator(): - ... base_name = "my_column_" - ... count = 0 - ... while True: - ... yield f"{base_name}{count}" - ... count += 1 - ... - >>> df.transpose(include_header=False, column_names=name_generator()) - shape: (2, 3) - ┌─────────────┬─────────────┬─────────────┐ - │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════════════╪═════════════╪═════════════╡ - │ 1 ┆ 2 ┆ 3 │ - │ 1 ┆ 2 ┆ 3 │ - └─────────────┴─────────────┴─────────────┘ - - Use an existing column as the new column names - - >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) - >>> df.transpose(column_names="id") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 3 ┆ 2 │ - │ 3 ┆ 4 ┆ 6 │ - └─────┴─────┴─────┘ - >>> df.transpose(include_header=True, header_name="new_id", column_names="id") - shape: (2, 4) - ┌────────┬─────┬─────┬─────┐ - │ new_id ┆ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╪═════╡ - │ col1 ┆ 1 ┆ 3 ┆ 2 │ - │ col2 ┆ 3 ┆ 4 ┆ 6 │ - └────────┴─────┴─────┴─────┘ - ''' - def reverse(self) -> DataFrame: - ''' - Reverse the DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "key": ["a", "b", "c"], - ... "val": [1, 2, 3], - ... } - ... 
) - >>> df.reverse() - shape: (3, 2) - ┌─────┬─────┐ - │ key ┆ val │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ c ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 1 │ - └─────┴─────┘ - - ''' - def rename(self, mapping: dict[str, str]) -> DataFrame: - ''' - Rename column names. - - Parameters - ---------- - mapping - Key value pairs that map from old name to new name. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} - ... ) - >>> df.rename({"foo": "apple"}) - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - - ''' - def insert_column(self, index: int, column: Series) -> Self: - ''' - Insert a Series at a certain column index. - - This operation is in place. - - Parameters - ---------- - index - Index at which to insert the new `Series` column. - column - `Series` to insert. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> s = pl.Series("baz", [97, 98, 99]) - >>> df.insert_column(1, s) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ baz ┆ bar │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 97 ┆ 4 │ - │ 2 ┆ 98 ┆ 5 │ - │ 3 ┆ 99 ┆ 6 │ - └─────┴─────┴─────┘ - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) - >>> df.insert_column(3, s) - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ d │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ - │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ - │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ - └─────┴──────┴───────┴──────┘ - - ''' - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: - ''' - Filter the rows in the DataFrame based on a predicate expression. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - predicates - Expression that evaluates to a boolean Series. - constraints - Column filters. Use name=value to filter column name by the supplied value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - - Filter on one condition: - - >>> df.filter(pl.col("foo") > 1) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Filter on multiple conditions, combined with and/or operators: - - >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Provide multiple filters using `*args` syntax: - - >>> df.filter( - ... pl.col("foo") <= 2, - ... ~pl.col("ham").is_in(["b", "c"]), - ... 
) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `**kwargs` syntax: - - >>> df.filter(foo=2, ham="b") - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def glimpse(self) -> str | None: - ''' - Return a dense preview of the DataFrame. - - The formatting shows one line per column so that wide dataframes display - cleanly. Each line shows the column name, the data type, and the first - few values. - - Parameters - ---------- - max_items_per_column - Maximum number of items to show per column. - max_colname_length - Maximum length of the displayed column names; values that exceed this - value are truncated with a trailing ellipsis. - return_as_string - If True, return the preview as a string instead of printing to stdout. - - See Also - -------- - describe, head, tail - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, 2.8, 3.0], - ... "b": [4, 5, None], - ... "c": [True, False, True], - ... "d": [None, "b", "c"], - ... "e": ["usd", "eur", None], - ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], - ... } - ... ) - >>> df.glimpse() - Rows: 3 - Columns: 6 - $ a 1.0, 2.8, 3.0 - $ b 4, 5, None - $ c True, False, True - $ d None, \'b\', \'c\' - $ e \'usd\', \'eur\', None - $ f 2020-01-01, 2021-01-02, 2022-01-01 - - ''' - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: - ''' - Summary statistics for a DataFrame. - - Parameters - ---------- - percentiles - One or more percentiles to include in the summary statistics. - All values must be in the range `[0, 1]`. - - Notes - ----- - The median is included by default as the 50% percentile. - - See Also - -------- - glimpse - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, 2.8, 3.0], - ... "b": [4, 5, None], - ... "c": [True, False, True], - ... "d": [None, "b", "c"], - ... "e": ["usd", "eur", None], - ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], - ... } - ... ) - >>> df.describe() - shape: (9, 7) - ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ - │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ - ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ - │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ - │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ - │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ - │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ - │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ - │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ - │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ - │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ - │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ - └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ - - ''' - def get_column_index(self, name: str) -> int: - ''' - Find the index of a column by name. - - Parameters - ---------- - name - Name of the column to find. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} - ... 
) - >>> df.get_column_index("ham") - 2 - - ''' - def replace_column(self, index: int, column: Series) -> Self: - ''' - Replace a column at an index location. - - This operation is in place. - - Parameters - ---------- - index - Column index. - column - Series that will replace the column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> s = pl.Series("apple", [10, 20, 30]) - >>> df.replace_column(0, s) - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 10 ┆ 6 ┆ a │ - │ 20 ┆ 7 ┆ b │ - │ 30 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - ''' - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: - ''' - Sort the dataframe by the given columns. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. - *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [6.0, 5.0, 4.0], - ... "c": ["a", "c", "b"], - ... } - ... ) - >>> df.sort("a") - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - Sorting by expressions is also supported. - - >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - └──────┴─────┴─────┘ - - Sort by multiple columns by passing a list of columns. - - >>> df.sort(["c", "a"], descending=True) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - └──────┴─────┴─────┘ - - Or use positional arguments to sort by multiple columns in the same way. - - >>> df.sort("c", "a", descending=[False, True]) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - ''' - def top_k(self, k: int) -> DataFrame: - ''' - Return the `k` largest elements. - - If \'descending=True` the smallest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - bottom_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 largest values in column b. 
- - >>> df.top_k(4, by="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ a ┆ 2 │ - │ b ┆ 2 │ - │ b ┆ 1 │ - └─────┴─────┘ - - Get the rows which contain the 4 largest values when sorting on column b and a. - - >>> df.top_k(4, by=["b", "a"]) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 2 │ - │ c ┆ 1 │ - └─────┴─────┘ - - ''' - def bottom_k(self, k: int) -> DataFrame: - ''' - Return the `k` smallest elements. - - If \'descending=True` the largest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - top_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 smallest values in column b. - - >>> df.bottom_k(4, by="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 1 │ - │ a ┆ 1 │ - │ c ┆ 1 │ - │ a ┆ 2 │ - └─────┴─────┘ - - Get the rows which contain the 4 smallest values when sorting on column a and b. - - >>> df.bottom_k(4, by=["a", "b"]) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ b ┆ 1 │ - │ b ┆ 2 │ - └─────┴─────┘ - - ''' - def equals(self, other: DataFrame) -> bool: - ''' - Check whether the DataFrame is equal to another DataFrame. - - Parameters - ---------- - other - DataFrame to compare with. - null_equal - Consider null values as equal. - - See Also - -------- - assert_frame_equal - - Examples - -------- - >>> df1 = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df2 = pl.DataFrame( - ... { - ... "foo": [3, 2, 1], - ... "bar": [8.0, 7.0, 6.0], - ... "ham": ["c", "b", "a"], - ... } - ... ) - >>> df1.equals(df1) - True - >>> df1.equals(df2) - False - - ''' - def replace(self, column: str, new_column: Series) -> Self: - ''' - Replace a column by a new Series. - - Parameters - ---------- - column - Column to replace. - new_column - New column to insert. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> s = pl.Series([10, 20, 30]) - >>> df.replace("foo", s) # works in-place! # doctest: +SKIP - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 10 ┆ 4 │ - │ 20 ┆ 5 │ - │ 30 ┆ 6 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int, length: int | None = ...) -> Self: - ''' - Get a slice of this DataFrame. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.slice(1, 2) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7.0 ┆ b │ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def head(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the last `abs(n)`. - - See Also - -------- - tail, glimpse, slice - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> df.head(3) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Pass a negative value to get all rows `except` the last `abs(n)`. - - >>> df.head(-3) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def tail(self, n: int = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the first `abs(n)`. - - See Also - -------- - head, slice - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> df.tail(3) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - │ 4 ┆ 9 ┆ d │ - │ 5 ┆ 10 ┆ e │ - └─────┴─────┴─────┘ - - Pass a negative value to get all rows `except` the first `abs(n)`. - - >>> df.tail(-3) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 4 ┆ 9 ┆ d │ - │ 5 ┆ 10 ┆ e │ - └─────┴─────┴─────┘ - - ''' - def limit(self, n: int = ...) -> Self: - """ - Get the first `n` rows. - - Alias for :func:`DataFrame.head`. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the last `abs(n)`. - - See Also - -------- - head - - """ - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: - ''' - Drop all rows that contain null values. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - subset - Column name(s) for which null values are considered. - If set to `None` (default), use all columns. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, None, 8], - ... "ham": ["a", "b", None], - ... } - ... ) - - The default behavior of this method is to drop rows where any single - value of the row is null. - - >>> df.drop_nulls() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - This behaviour can be constrained to consider only a subset of columns, as - defined by name or with a selector. 
For example, dropping rows if there is - a null in any of the integer columns: - - >>> import polars.selectors as cs - >>> df.drop_nulls(subset=cs.integer()) - shape: (2, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ null │ - └─────┴─────┴──────┘ - - Below are some additional examples that show how to drop null - values based on other conditions. - - >>> df = pl.DataFrame( - ... { - ... "a": [None, None, None, None], - ... "b": [1, 2, None, 1], - ... "c": [1, None, None, 1], - ... } - ... ) - >>> df - shape: (4, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪══════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ null ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴──────┴──────┘ - - Drop a row only if all values are null: - - >>> df.filter(~pl.all_horizontal(pl.all().is_null())) - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴─────┴──────┘ - - Drop a column if all values are null: - - >>> df[[s.name for s in df if not (s.null_count() == df.height)]] - shape: (4, 2) - ┌──────┬──────┐ - │ b ┆ c │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 1 ┆ 1 │ - │ 2 ┆ null │ - │ null ┆ null │ - │ 1 ┆ 1 │ - └──────┴──────┘ - - ''' - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the frame as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Notes - ----- - It is recommended to use LazyFrame when piping operations, in order - to fully take advantage of query optimization and parallelization. - See :meth:`df.lazy() `. - - Examples - -------- - >>> def cast_str_to_int(data, col_name): - ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) - ... - >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) - >>> df.pipe(cast_str_to_int, col_name="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 30 │ - │ 4 ┆ 40 │ - └─────┴─────┘ - - >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) - >>> df - shape: (2, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - └─────┴─────┘ - >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 1 │ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: - ''' - Add a column at index 0 that counts the rows. - - Parameters - ---------- - name - Name of the column to add. - offset - Start the row count at this offset. Default = 0 - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... 
) - >>> df.with_row_count() - shape: (3, 3) - ┌────────┬─────┬─────┐ - │ row_nr ┆ a ┆ b │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╡ - │ 0 ┆ 1 ┆ 2 │ - │ 1 ┆ 3 ┆ 4 │ - │ 2 ┆ 5 ┆ 6 │ - └────────┴─────┴─────┘ - - ''' - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: - ''' - Start a group by operation. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - .. note:: - Within each group, the order of rows is always preserved, regardless - of this argument. - - Returns - ------- - GroupBy - Object which can be used to perform aggregations. - - Examples - -------- - Group by one column and call `agg` to compute the grouped sum of another - column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... ) - >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 2 │ - │ b ┆ 5 │ - │ c ┆ 3 │ - └─────┴─────┘ - - Set `maintain_order=True` to ensure the order of the groups is consistent with - the input. - - >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) - shape: (3, 2) - ┌─────┬───────────┐ - │ a ┆ c │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [5, 3] │ - │ b ┆ [4, 2] │ - │ c ┆ [1] │ - └─────┴───────────┘ - - Group by multiple columns by passing a list of column names. - - >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - Or use positional arguments to group by multiple columns in the same way. - Expressions are also accepted. - - >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ f64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 0 ┆ 4.0 │ - │ b ┆ 1 ┆ 3.0 │ - │ c ┆ 1 ┆ 1.0 │ - └─────┴─────┴─────┘ - - The `GroupBy` object returned by this method is iterable, returning the name - and data of each group. - - >>> for name, data in df.group_by("a"): # doctest: +SKIP - ... print(name) - ... print(data) - ... - a - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘ - b - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘ - c - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def rolling(self, index_column: IntoExpr) -> RollingGroupBy: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - Different from a `group_by_dynamic` the windows are now determined by the - individual values and are not of constant intervals. 
For constant intervals use - :func:`DataFrame.group_by_dynamic`. - - If you have a time series ``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... - * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... - * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - **"1i" # length 1** - - **"10i" # length 10** - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling operation on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - RollingGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - group_by_dynamic - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> out = df.rolling(index_column="dt", period="2d").agg( - ... [ - ... pl.sum("a").alias("sum_a"), - ... pl.min("a").alias("min_a"), - ... pl.max("a").alias("max_a"), - ... ] - ... 
) - >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] - >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] - >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] - >>> out - shape: (6, 4) - ┌─────────────────────┬───────┬───────┬───────┐ - │ dt ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴───────┴───────┴───────┘ - - ''' - def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - Time windows are calculated and rows are assigned to windows. Different from a - normal group by is that a row can be member of multiple groups. - By default, the windows look like: - - - [start, start + period) - - [start + every, start + every + period) - - [start + 2*every, start + 2*every + period) - - ... - - where `start` is determined by `start_by`, `offset`, and `every` (see parameter - descriptions below). - - .. warning:: - The index column must be sorted in ascending order. If `by` is passed, then - the index column must be sorted in ascending order within each group. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - - .. deprecated:: 0.19.4 - Use `label` instead. - include_boundaries - Add the lower and upper bound of the window to the "_lower_boundary" and - "_upper_boundary" columns. This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - label : {\'left\', \'right\', \'datapoint\'} - Define which label to use for the window: - - - \'left\': lower boundary of the window - - \'right\': upper boundary of the window - - \'datapoint\': the first value of the index column in the given window. - If you don\'t need the label to be at one of the boundaries, choose this - option for maximum performance - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. 
- * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - DynamicGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - rolling - - Notes - ----- - 1) If you\'re coming from pandas, then - - .. code-block:: python - - # polars - df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) - - is equivalent to - - .. code-block:: python - - # pandas - df.set_index("ts").resample("D")["value"].sum().reset_index() - - though note that, unlike pandas, polars doesn\'t add extra rows for empty - windows. If you need `index_column` to be evenly spaced, then please combine - with :func:`DataFrame.upsample`. - - 2) The `every`, `period` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a group_by_dynamic on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Examples - -------- - >>> from datetime import datetime - >>> df = pl.DataFrame( - ... { - ... "time": pl.datetime_range( - ... start=datetime(2021, 12, 16), - ... end=datetime(2021, 12, 16, 3), - ... interval="30m", - ... eager=True, - ... ), - ... "n": range(7), - ... } - ... ) - >>> df - shape: (7, 2) - ┌─────────────────────┬─────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i64 │ - ╞═════════════════════╪═════╡ - │ 2021-12-16 00:00:00 ┆ 0 │ - │ 2021-12-16 00:30:00 ┆ 1 │ - │ 2021-12-16 01:00:00 ┆ 2 │ - │ 2021-12-16 01:30:00 ┆ 3 │ - │ 2021-12-16 02:00:00 ┆ 4 │ - │ 2021-12-16 02:30:00 ┆ 5 │ - │ 2021-12-16 03:00:00 ┆ 6 │ - └─────────────────────┴─────┘ - - Group by windows of 1 hour starting at 2021-12-16 00:00:00. - - >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [1, 2] │ - │ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ 2021-12-16 02:00:00 ┆ [5, 6] │ - └─────────────────────┴───────────┘ - - The window boundaries can also be added to the aggregation result - - >>> df.group_by_dynamic( - ... "time", every="1h", include_boundaries=True, closed="right" - ... 
).agg(pl.col("n").mean()) - shape: (4, 4) - ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ - │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ - ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ - │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ - │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ - │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ - │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ - └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ - - When closed="left", the window excludes the right end of interval: - [lower_bound, upper_bound) - - >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-16 00:00:00 ┆ [0, 1] │ - │ 2021-12-16 01:00:00 ┆ [2, 3] │ - │ 2021-12-16 02:00:00 ┆ [4, 5] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - When closed="both" the time values at the window boundaries belong to 2 groups. - - >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) - shape: (5, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ - │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - Dynamic group bys can also be combined with grouping on normal keys - - >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) - >>> df - shape: (7, 3) - ┌─────────────────────┬─────┬────────┐ - │ time ┆ n ┆ groups │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ str │ - ╞═════════════════════╪═════╪════════╡ - │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ - │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ - │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ - │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ - │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ - │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ - │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ - └─────────────────────┴─────┴────────┘ - >>> df.group_by_dynamic( - ... "time", - ... every="1h", - ... closed="both", - ... by="groups", - ... include_boundaries=True, - ... 
).agg(pl.col("n")) - shape: (7, 5) - ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ - │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ - ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ - │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ - │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ - │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ - │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ - │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ - └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ - - Dynamic group by on an index column - - >>> df = pl.DataFrame( - ... { - ... "idx": pl.int_range(0, 6, eager=True), - ... "A": ["A", "A", "B", "B", "B", "C"], - ... } - ... ) - >>> ( - ... df.group_by_dynamic( - ... "idx", - ... every="2i", - ... period="3i", - ... include_boundaries=True, - ... closed="right", - ... ).agg(pl.col("A").alias("A_agg_list")) - ... ) - shape: (4, 4) - ┌─────────────────┬─────────────────┬─────┬─────────────────┐ - │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 ┆ list[str] │ - ╞═════════════════╪═════════════════╪═════╪═════════════════╡ - │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ - │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ - │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ - │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ - └─────────────────┴─────────────────┴─────┴─────────────────┘ - - ''' - def upsample(self, time_column: str) -> Self: - ''' - Upsample a DataFrame at a regular frequency. - - The `every` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - - - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - Parameters - ---------- - time_column - time column will be used to determine a date_range. - Note that this column has to be sorted for the output to make sense. - every - interval will start \'every\' duration - offset - change the start of the date_range by this offset. - by - First group by these columns and then upsample for every group - maintain_order - Keep the ordering predictable. This is slower. - - Returns - ------- - DataFrame - Result will be sorted by `time_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - Examples - -------- - Upsample a DataFrame by a certain interval. - - >>> from datetime import datetime - >>> df = pl.DataFrame( - ... { - ... "time": [ - ... datetime(2021, 2, 1), - ... datetime(2021, 4, 1), - ... datetime(2021, 5, 1), - ... datetime(2021, 6, 1), - ... ], - ... "groups": ["A", "B", "A", "B"], - ... 
"values": [0, 1, 2, 3], - ... } - ... ).set_sorted("time") - >>> df.upsample( - ... time_column="time", every="1mo", by="groups", maintain_order=True - ... ).select(pl.all().forward_fill()) - shape: (7, 3) - ┌─────────────────────┬────────┬────────┐ - │ time ┆ groups ┆ values │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ str ┆ i64 │ - ╞═════════════════════╪════════╪════════╡ - │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ - │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ - │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ - │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ - └─────────────────────┴────────┴────────┘ - - ''' - def join_asof(self, other: DataFrame) -> DataFrame: - ''' - Perform an asof join. - - This is similar to a left-join except that we match on nearest key rather than - equal keys. - - Both DataFrames must be sorted by the asof_join key. - - For each row in the left DataFrame: - - - A "backward" search selects the last row in the right DataFrame whose - \'on\' key is less than or equal to the left\'s key. - - - A "forward" search selects the first row in the right DataFrame whose - \'on\' key is greater than or equal to the left\'s key. - - - A "nearest" search selects the last row in the right DataFrame whose value - is nearest to the left\'s key. String keys are not currently supported for a - nearest search. - - The default is "backward". - - Parameters - ---------- - other - Lazy DataFrame to join with. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - by - join on these columns before doing asof join - by_left - join on these columns before doing asof join - by_right - join on these columns before doing asof join - strategy : {\'backward\', \'forward\', \'nearest\'} - Join strategy. - suffix - Suffix to append to columns with a duplicate name. - tolerance - Numeric tolerance. By setting this the join will only be done if the near - keys are within this distance. If an asof join is done on columns of dtype - "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta - object or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. - - Examples - -------- - >>> from datetime import datetime - >>> gdp = pl.DataFrame( - ... { - ... "date": [ - ... datetime(2016, 1, 1), - ... datetime(2017, 1, 1), - ... datetime(2018, 1, 1), - ... datetime(2019, 1, 1), - ... ], # note record date: Jan 1st (sorted!) - ... "gdp": [4164, 4411, 4566, 4696], - ... } - ... ).set_sorted("date") - >>> population = pl.DataFrame( - ... { - ... "date": [ - ... datetime(2016, 5, 12), - ... datetime(2017, 5, 12), - ... 
datetime(2018, 5, 12), - ... datetime(2019, 5, 12), - ... ], # note record date: May 12th (sorted!) - ... "population": [82.19, 82.66, 83.12, 83.52], - ... } - ... ).set_sorted("date") - >>> population.join_asof(gdp, on="date", strategy="backward") - shape: (4, 3) - ┌─────────────────────┬────────────┬──────┐ - │ date ┆ population ┆ gdp │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ f64 ┆ i64 │ - ╞═════════════════════╪════════════╪══════╡ - │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ - │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ - │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ - │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ - └─────────────────────┴────────────┴──────┘ - - ''' - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: - ''' - Join in SQL-like fashion. - - Parameters - ---------- - other - DataFrame to join with. - on - Name(s) of the join columns in both DataFrames. - how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} - Join strategy. - - .. note:: - A left join preserves the row order of the left DataFrame. - left_on - Name(s) of the left join column(s). - right_on - Name(s) of the right join column(s). - suffix - Suffix to append to columns with a duplicate name. - validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} - Checks if join is of specified type. - - * *many_to_many* - “m:m”: default, does not result in checks - * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets - * *one_to_many* - “1:m”: check if join keys are unique in left dataset - * *many_to_one* - “m:1”: check if join keys are unique in right dataset - - .. note:: - - - This is currently not supported the streaming engine. - - This is only supported when joined by single columns. - - Returns - ------- - DataFrame - - See Also - -------- - join_asof - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> other_df = pl.DataFrame( - ... { - ... "apple": ["x", "y", "z"], - ... "ham": ["a", "b", "d"], - ... } - ... 
) - >>> df.join(other_df, on="ham") - shape: (2, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - └─────┴─────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="outer") - shape: (4, 4) - ┌──────┬──────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞══════╪══════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ null ┆ null ┆ d ┆ z │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └──────┴──────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="left") - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └─────┴─────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="semi") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 7.0 ┆ b │ - └─────┴─────┴─────┘ - - >>> df.join(other_df, on="ham", how="anti") - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - Notes - ----- - For joining on columns with categorical data, see `pl.StringCache()`. - - ''' - def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: - ''' - Apply a custom/user-defined function (UDF) over the rows of the DataFrame. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - The UDF will receive each row as a tuple of values: `udf(row)`. - - Implementing logic using a Python function is almost always *significantly* - slower and more memory intensive than implementing the same logic using - the native expression API because: - - - The native expression engine runs in Rust; UDFs run in Python. - - Use of Python UDFs forces the DataFrame to be materialized in memory. - - Polars-native expressions can be parallelised (UDFs typically cannot). - - Polars-native expressions can be logically optimised (UDFs cannot). - - Wherever possible you should strongly prefer the native expression API - to achieve the best performance. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output type of the operation. If none given, Polars tries to infer the type. - inference_size - Only used in the case when the custom function returns rows. - This uses the first `n` rows to determine the output schema. - - Notes - ----- - * The frame-level `apply` cannot track column names (as the UDF is a black-box - that may arbitrarily drop, rearrange, transform, or add new columns); if you - want to apply a UDF such that column names are preserved, you should use the - expression-level `apply` syntax instead. - - * If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. 
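An illustrative sketch of the `@lru_cache` suggestion above (the `expensive_lookup` helper is hypothetical, standing in for a costly per-row computation):

.. code-block:: python

    from functools import lru_cache

    import polars as pl

    @lru_cache(maxsize=None)
    def expensive_lookup(x):
        # Placeholder for an expensive computation; repeated inputs are
        # answered from the cache instead of being recomputed.
        return x * 10

    df = pl.DataFrame({"foo": [1, 2, 2, 1]})
    df.map_rows(lambda t: expensive_lookup(t[0]))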
- - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) - - Return a DataFrame by mapping each row to a tuple: - - >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) - shape: (3, 2) - ┌──────────┬──────────┐ - │ column_0 ┆ column_1 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════════╪══════════╡ - │ 2 ┆ -3 │ - │ 4 ┆ 15 │ - │ 6 ┆ 24 │ - └──────────┴──────────┘ - - However, it is much better to implement this with a native expression: - - >>> df.select( - ... pl.col("foo") * 2, - ... pl.col("bar") * 3, - ... ) # doctest: +IGNORE_RESULT - - Return a DataFrame with a single column by mapping each row to a scalar: - - >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP - shape: (3, 1) - ┌───────┐ - │ apply │ - │ --- │ - │ i64 │ - ╞═══════╡ - │ 1 │ - │ 9 │ - │ 14 │ - └───────┘ - - In this case it is better to use the following native expression: - - >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT - - ''' - def hstack(self, columns: list[Series] | DataFrame) -> Self: - ''' - Return a new DataFrame grown horizontally by stacking multiple Series to it. - - Parameters - ---------- - columns - Series to stack. - in_place - Modify in place. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> x = pl.Series("apple", [10, 20, 30]) - >>> df.hstack([x]) - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6 ┆ a ┆ 10 │ - │ 2 ┆ 7 ┆ b ┆ 20 │ - │ 3 ┆ 8 ┆ c ┆ 30 │ - └─────┴─────┴─────┴───────┘ - - ''' - def vstack(self, other: DataFrame) -> Self: - ''' - Grow this DataFrame vertically by stacking a DataFrame to it. - - Parameters - ---------- - other - DataFrame to stack. - in_place - Modify in place. - - See Also - -------- - extend - - Examples - -------- - >>> df1 = pl.DataFrame( - ... { - ... "foo": [1, 2], - ... "bar": [6, 7], - ... "ham": ["a", "b"], - ... } - ... ) - >>> df2 = pl.DataFrame( - ... { - ... "foo": [3, 4], - ... "bar": [8, 9], - ... "ham": ["c", "d"], - ... } - ... ) - >>> df1.vstack(df2) - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - │ 4 ┆ 9 ┆ d │ - └─────┴─────┴─────┘ - - ''' - def extend(self, other: DataFrame) -> Self: - ''' - Extend the memory backed by this `DataFrame` with the values from `other`. - - Different from `vstack` which adds the chunks from `other` to the chunks of - this `DataFrame`, `extend` appends the data from `other` to the underlying - memory locations and thus may cause a reallocation. - - If this does not cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `vstack` when you want to do a query after a single - append. For instance, during online operations where you add `n` rows and rerun - a query. - - Prefer `vstack` over `extend` when you want to append many times before - doing a query. For instance, when you read in multiple files and want to store - them in a single `DataFrame`. In the latter case, finish the sequence of - `vstack` operations with a `rechunk`. - - Parameters - ---------- - other - DataFrame to vertically add. - - Warnings - -------- - This method modifies the dataframe in-place. The dataframe is returned for - convenience only. 
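A minimal sketch of the guidance above (the frames below are hypothetical): finish a series of `vstack` calls with a single `rechunk` when appending many frames, and use `extend` for a single append followed by a query:

.. code-block:: python

    import polars as pl

    frames = [pl.DataFrame({"foo": [i], "bar": [i * 10]}) for i in range(3)]

    # Many appends: vstack each frame, then rechunk once before querying.
    out = frames[0]
    for frame in frames[1:]:
        out = out.vstack(frame)
    out = out.rechunk()

    # Single append before a query: extend writes into the existing memory.
    out.extend(pl.DataFrame({"foo": [100], "bar": [1000]}))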
- - See Also - -------- - vstack - - Examples - -------- - >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) - >>> df1.extend(df2) - shape: (6, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 5 │ - │ 3 ┆ 6 │ - │ 10 ┆ 40 │ - │ 20 ┆ 50 │ - │ 30 ┆ 60 │ - └─────┴─────┘ - - ''' - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: - ''' - Remove columns from the dataframe. - - Parameters - ---------- - columns - Names of the columns that should be removed from the dataframe, or - a selector that determines the columns to drop. - *more_columns - Additional columns to drop, specified as positional arguments. - - Examples - -------- - Drop a single column by passing the name of that column. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.drop("ham") - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 6.0 │ - │ 2 ┆ 7.0 │ - │ 3 ┆ 8.0 │ - └─────┴─────┘ - - Drop multiple columns by passing a list of column names. - - >>> df.drop(["bar", "ham"]) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Drop multiple columns by passing a selector. - - >>> import polars.selectors as cs - >>> df.drop(cs.numeric()) - shape: (3, 1) - ┌─────┐ - │ ham │ - │ --- │ - │ str │ - ╞═════╡ - │ a │ - │ b │ - │ c │ - └─────┘ - - Use positional arguments to drop multiple columns. - - >>> df.drop("foo", "ham") - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 6.0 │ - │ 7.0 │ - │ 8.0 │ - └─────┘ - - ''' - def drop_in_place(self, name: str) -> Series: - ''' - Drop a single column in-place and return the dropped column. - - Parameters - ---------- - name - Name of the column to drop. - - Returns - ------- - Series - The dropped column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.drop_in_place("ham") - shape: (3,) - Series: \'ham\' [str] - [ - "a" - "b" - "c" - ] - - ''' - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: - ''' - Cast DataFrame column(s) to the specified dtype(s). - - Parameters - ---------- - dtypes - Mapping of column names (or selector) to dtypes, or a single dtype - to which all columns will be cast. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], - ... } - ... 
) - - Cast specific frame columns to the specified dtypes: - - >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ u8 ┆ date │ - ╞═════╪═════╪════════════╡ - │ 1.0 ┆ 6 ┆ 2020-01-02 │ - │ 2.0 ┆ 7 ┆ 2021-03-04 │ - │ 3.0 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - Cast all frame columns to the specified dtype: - - >>> df.cast(pl.Utf8).to_dict(as_series=False) - {\'foo\': [\'1\', \'2\', \'3\'], - \'bar\': [\'6.0\', \'7.0\', \'8.0\'], - \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} - - Use selectors to define the columns being cast: - - >>> import polars.selectors as cs - >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ str │ - ╞═════╪═════╪════════════╡ - │ 1 ┆ 6 ┆ 2020-01-02 │ - │ 2 ┆ 7 ┆ 2021-03-04 │ - │ 3 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - ''' - def clear(self, n: int = ...) -> Self: - ''' - Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. - - Returns a `n`-row null-filled DataFrame with an identical schema. - `n` can be greater than the current number of rows in the DataFrame. - - Parameters - ---------- - n - Number of (null-filled) rows to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> df.clear() - shape: (0, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪═════╪══════╡ - └─────┴─────┴──────┘ - - >>> df.clear(n=2) - shape: (2, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪══════╪══════╡ - │ null ┆ null ┆ null │ - │ null ┆ null ┆ null │ - └──────┴──────┴──────┘ - - ''' - def clone(self) -> Self: - ''' - Create a copy of this DataFrame. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current DataFrame, with identical - schema but no data. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.clone() - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true │ - │ 2 ┆ 4.0 ┆ true │ - │ 3 ┆ 10.0 ┆ false │ - │ 4 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - ''' - def get_columns(self) -> list[Series]: - ''' - Get the DataFrame as a List of Series. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.get_columns() - [shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 3 - ], shape: (3,) - Series: \'bar\' [i64] - [ - 4 - 5 - 6 - ]] - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.get_columns() - [shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - ], shape: (4,) - Series: \'b\' [f64] - [ - 0.5 - 4.0 - 10.0 - 13.0 - ], shape: (4,) - Series: \'c\' [bool] - [ - true - true - false - true - ]] - - ''' - def get_column(self, name: str) -> Series: - ''' - Get a single column by name. - - Parameters - ---------- - name : str - Name of the column to retrieve. 
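A supplementary sketch (using the same columns as the example below): a column can be fetched by name with `get_column` or by position with `to_series`:

.. code-block:: python

    import polars as pl

    df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
    by_name = df.get_column("bar")  # Series 'bar'
    by_index = df.to_series(1)      # the same column, selected by position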
- - Returns - ------- - Series - - See Also - -------- - to_series - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.get_column("foo") - shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - matches_supertype - Fill all matching supertype of the fill `value`. - - Returns - ------- - DataFrame - DataFrame with None values replaced by the filling strategy. - - See Also - -------- - fill_nan - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 4], - ... "b": [0.5, 4, None, 13], - ... } - ... ) - >>> df.fill_null(99) - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 99 ┆ 99.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - >>> df.fill_null(strategy="forward") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> df.fill_null(strategy="max") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> df.fill_null(strategy="zero") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 0 ┆ 0.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - ''' - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: - ''' - Fill floating point NaN values by an Expression evaluation. - - Parameters - ---------- - value - Value with which to replace NaN values. - - Returns - ------- - DataFrame - DataFrame with NaN values replaced by the given value. - - Warnings - -------- - Note that floating point NaNs (Not a Number) are not missing values! - To replace missing values, use :func:`fill_null`. - - See Also - -------- - fill_null - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1.5, 2, float("nan"), 4], - ... "b": [0.5, 4, float("nan"), 13], - ... } - ... ) - >>> df.fill_nan(99) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════╡ - │ 1.5 ┆ 0.5 │ - │ 2.0 ┆ 4.0 │ - │ 99.0 ┆ 99.0 │ - │ 4.0 ┆ 13.0 │ - └──────┴──────┘ - - ''' - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: - ''' - Explode the dataframe to long format by exploding the given columns. - - Parameters - ---------- - columns - Column names, expressions, or a selector defining them. The underlying - columns being exploded must be of List or Utf8 datatype. - *more_columns - Additional names of columns to explode, specified as positional arguments. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "letters": ["a", "a", "b", "c"], - ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], - ... } - ... 
) - >>> df - shape: (4, 2) - ┌─────────┬───────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════════╪═══════════╡ - │ a ┆ [1] │ - │ a ┆ [2, 3] │ - │ b ┆ [4, 5] │ - │ c ┆ [6, 7, 8] │ - └─────────┴───────────┘ - >>> df.explode("numbers") - shape: (8, 2) - ┌─────────┬─────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════════╪═════════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ a ┆ 3 │ - │ b ┆ 4 │ - │ b ┆ 5 │ - │ c ┆ 6 │ - │ c ┆ 7 │ - │ c ┆ 8 │ - └─────────┴─────────┘ - - ''' - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: - ''' - Create a spreadsheet-style pivot table as a DataFrame. - - Only available in eager mode. See "Examples" section below for how to do a - "lazy pivot" if you know the unique column values in advance. - - Parameters - ---------- - values - Column values to aggregate. Can be multiple columns if the *columns* - arguments contains multiple columns as well. - index - One or multiple keys to group by. - columns - Name of the column(s) whose values will be used as the header of the output - DataFrame. - aggregate_function - Choose from: - - - None: no aggregation takes place, will raise error if multiple values are in group. - - A predefined aggregate function string, one of - {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} - - An expression to do the aggregation. - - maintain_order - Sort the grouped keys so that the output order is predictable. - sort_columns - Sort the transposed columns by name. Default is by order of discovery. - separator - Used as separator/delimiter in generated column names. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": ["one", "one", "two", "two", "one", "two"], - ... "bar": ["y", "y", "y", "x", "x", "x"], - ... "baz": [1, 2, 3, 4, 5, 6], - ... } - ... ) - >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ y ┆ x │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ one ┆ 3 ┆ 5 │ - │ two ┆ 3 ┆ 10 │ - └─────┴─────┴─────┘ - - Pivot using selectors to determine the index/values/columns: - - >>> import polars.selectors as cs - >>> df.pivot( - ... values=cs.numeric(), - ... index=cs.string(), - ... columns=cs.string(), - ... aggregate_function="sum", - ... sort_columns=True, - ... ).sort( - ... by=cs.string(), - ... ) - shape: (4, 6) - ┌─────┬─────┬──────┬──────┬──────┬──────┐ - │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪══════╪══════╪══════╪══════╡ - │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ - │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ - │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ - │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ - └─────┴─────┴──────┴──────┴──────┴──────┘ - - Run an expression as aggregation function - - >>> df = pl.DataFrame( - ... { - ... "col1": ["a", "a", "a", "b", "b", "b"], - ... "col2": ["x", "x", "x", "x", "y", "y"], - ... "col3": [6, 7, 3, 2, 5, 7], - ... } - ... ) - >>> df.pivot( - ... index="col1", - ... columns="col2", - ... values="col3", - ... aggregate_function=pl.element().tanh().mean(), - ... 
) - shape: (2, 3) - ┌──────┬──────────┬──────────┐ - │ col1 ┆ x ┆ y │ - │ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 │ - ╞══════╪══════════╪══════════╡ - │ a ┆ 0.998347 ┆ null │ - │ b ┆ 0.964028 ┆ 0.999954 │ - └──────┴──────────┴──────────┘ - - Note that `pivot` is only available in eager mode. If you know the unique - column values in advance, you can use :meth:`polars.LazyFrame.groupby` to - get the same result as above in lazy mode: - - >>> index = pl.col("col1") - >>> columns = pl.col("col2") - >>> values = pl.col("col3") - >>> unique_column_values = ["x", "y"] - >>> aggregate_function = lambda col: col.tanh().mean() - >>> ( - ... df.lazy() - ... .group_by(index) - ... .agg( - ... *[ - ... aggregate_function(values.filter(columns == value)).alias(value) - ... for value in unique_column_values - ... ] - ... ) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌──────┬──────────┬──────────┐ - │ col1 ┆ x ┆ y │ - │ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 │ - ╞══════╪══════════╪══════════╡ - │ a ┆ 0.998347 ┆ null │ - │ b ┆ 0.964028 ┆ 0.999954 │ - └──────┴──────────┴──────────┘ - - ''' - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: - ''' - Unpivot a DataFrame from wide to long format. - - Optionally leaves identifiers set. - - This function is useful to massage a DataFrame into a format where one or more - columns are identifier variables (id_vars) while all other columns, considered - measured variables (value_vars), are "unpivoted" to the row axis leaving just - two non-identifier columns, \'variable\' and \'value\'. - - Parameters - ---------- - id_vars - Column(s) or selector(s) to use as identifier variables. - value_vars - Column(s) or selector(s) to use as values variables; if `value_vars` - is empty all columns that are not in `id_vars` will be used. - variable_name - Name to give to the `variable` column. Defaults to "variable" - value_name - Name to give to the `value` column. Defaults to "value" - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> import polars.selectors as cs - >>> df.melt(id_vars="a", value_vars=cs.numeric()) - shape: (6, 3) - ┌─────┬──────────┬───────┐ - │ a ┆ variable ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 │ - ╞═════╪══════════╪═══════╡ - │ x ┆ b ┆ 1 │ - │ y ┆ b ┆ 3 │ - │ z ┆ b ┆ 5 │ - │ x ┆ c ┆ 2 │ - │ y ┆ c ┆ 4 │ - │ z ┆ c ┆ 6 │ - └─────┴──────────┴───────┘ - - ''' - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: - ''' - Unstack a long table to a wide form without doing an aggregation. - - This can be much faster than a pivot, because it can skip the grouping phase. - - Warnings - -------- - This functionality is experimental and may be subject to changes - without it being considered a breaking change. - - Parameters - ---------- - step - Number of rows in the unstacked frame. - how : { \'vertical\', \'horizontal\' } - Direction of the unstack. - columns - Column name(s) or selector(s) to include in the operation. - If set to `None` (default), use all columns. - fill_values - Fill values that don\'t fit the new size with this value. 
- - Examples - -------- - >>> from string import ascii_uppercase - >>> df = pl.DataFrame( - ... { - ... "x": list(ascii_uppercase[0:8]), - ... "y": pl.int_range(1, 9, eager=True), - ... } - ... ).with_columns( - ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), - ... ) - >>> df - shape: (8, 3) - ┌─────┬─────┬──────────┐ - │ x ┆ y ┆ z │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ list[u8] │ - ╞═════╪═════╪══════════╡ - │ A ┆ 1 ┆ [1, 2] │ - │ B ┆ 2 ┆ [2, 3] │ - │ C ┆ 3 ┆ [3, 4] │ - │ D ┆ 4 ┆ [4, 5] │ - │ E ┆ 5 ┆ [5, 6] │ - │ F ┆ 6 ┆ [6, 7] │ - │ G ┆ 7 ┆ [7, 8] │ - │ H ┆ 8 ┆ [8, 9] │ - └─────┴─────┴──────────┘ - >>> df.unstack(step=4, how="vertical") - shape: (4, 6) - ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ - │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ - ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ - │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ - │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ - │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ - │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ - └─────┴─────┴─────┴─────┴──────────┴──────────┘ - >>> df.unstack(step=2, how="horizontal") - shape: (4, 6) - ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ - │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ - ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ - │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ - │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ - │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ - │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ - └─────┴─────┴─────┴─────┴──────────┴──────────┘ - >>> import polars.selectors as cs - >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) - shape: (5, 2) - ┌─────┬─────┐ - │ y_0 ┆ y_1 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - │ 4 ┆ 0 │ - │ 5 ┆ 0 │ - └─────┴─────┘ - - ''' - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: - ''' - Group by the given columns and return the groups as separate dataframes. - - Parameters - ---------- - by - Column name(s) or selector(s) to group by. - *more_by - Additional names of columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default partition by operation. - include_key - Include the columns used to partition the DataFrame in the output. - as_dict - Return a dictionary instead of a list. The dictionary keys are the distinct - group values that identify that group. - - Examples - -------- - Pass a single column name to partition by that column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... 
) - >>> df.partition_by("a") # doctest: +IGNORE_RESULT - [shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘] - - Partition by multiple columns by either passing a list of column names, or by - specifying each column name as a positional argument. - - >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT - [shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘] - - Return the partitions as a dictionary by specifying `as_dict=True`. - - >>> import polars.selectors as cs - >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT - {\'a\': shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - \'b\': shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - \'c\': shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘} - - ''' - def shift(self, n: int = ...) -> DataFrame: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... ) - >>> df.shift() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ null ┆ null │ - │ 1 ┆ 5 │ - │ 2 ┆ 6 │ - │ 3 ┆ 7 │ - └──────┴──────┘ - - Pass a negative value to shift in the opposite direction instead. - - >>> df.shift(-2) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ null ┆ null │ - │ null ┆ null │ - └──────┴──────┘ - - Specify `fill_value` to fill the resulting null values. 
- - >>> df.shift(-2, fill_value=100) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ 100 ┆ 100 │ - │ 100 ┆ 100 │ - └─────┴─────┘ - - ''' - def is_duplicated(self) -> Series: - ''' - Get a mask of all duplicated rows in this DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) - >>> df.is_duplicated() - shape: (4,) - Series: \'\' [bool] - [ - true - false - false - true - ] - - This mask can be used to visualize the duplicated lines like this: - - >>> df.filter(df.is_duplicated()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ str │ - ╞═════╪═════╡ - │ 1 ┆ x │ - │ 1 ┆ x │ - └─────┴─────┘ - ''' - def is_unique(self) -> Series: - ''' - Get a mask of all unique rows in this DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) - >>> df.is_unique() - shape: (4,) - Series: \'\' [bool] - [ - false - true - true - false - ] - - This mask can be used to visualize the unique lines like this: - - >>> df.filter(df.is_unique()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ str │ - ╞═════╪═════╡ - │ 2 ┆ y │ - │ 3 ┆ z │ - └─────┴─────┘ - ''' - def lazy(self) -> LazyFrame: - ''' - Start a lazy query from this point. This returns a `LazyFrame` object. - - Operations on a `LazyFrame` are not executed until this is requested by either - calling: - - * :meth:`.fetch() ` - (run on a small number of rows) - * :meth:`.collect() ` - (run on all data) - * :meth:`.describe_plan() ` - (print unoptimized query plan) - * :meth:`.describe_optimized_plan() ` - (print optimized query plan) - * :meth:`.show_graph() ` - (show (un)optimized query plan as graphviz graph) - - Lazy operations are advised because they allow for query optimization and more - parallelization. - - Returns - ------- - LazyFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> df.lazy() # doctest: +ELLIPSIS - - - ''' - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - ''' - Select columns from this DataFrame. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Examples - -------- - Pass the name of a column to select that column. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.select("foo") - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Multiple columns can be selected by passing a list of column names. - - >>> df.select(["foo", "bar"]) - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - └─────┴─────┘ - - Multiple columns can also be selected using positional arguments instead of a - list. Expressions are also accepted. 
- - >>> df.select(pl.col("foo"), pl.col("bar") + 1) - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - └─────┴─────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) - shape: (3, 1) - ┌───────────┐ - │ threshold │ - │ --- │ - │ i32 │ - ╞═══════════╡ - │ 0 │ - │ 0 │ - │ 10 │ - └───────────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... df.select( - ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), - ... ) - ... - shape: (3, 1) - ┌───────────┐ - │ is_odd │ - │ --- │ - │ struct[2] │ - ╞═══════════╡ - │ {1,0} │ - │ {0,1} │ - │ {1,0} │ - └───────────┘ - - ''' - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - """ - Select columns from this LazyFrame. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - See Also - -------- - select - - """ - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - ''' - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - DataFrame - A new DataFrame with the columns added. - - Notes - ----- - Creating a new DataFrame using this method does not create a new copy of - existing data. - - Examples - -------- - Pass an expression to add it as a new column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ a^2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ - └─────┴──────┴───────┴──────┘ - - Added columns will replace existing columns with the same name. - - >>> df.with_columns(pl.col("a").cast(pl.Float64)) - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1.0 ┆ 0.5 ┆ true │ - │ 2.0 ┆ 4.0 ┆ true │ - │ 3.0 ┆ 10.0 ┆ false │ - │ 4.0 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - Multiple columns can be added by passing a list of expressions. - - >>> df.with_columns( - ... [ - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ] - ... 
) - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Multiple columns also can be added using positional arguments instead of a list. - - >>> df.with_columns( - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ) - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> df.with_columns( - ... ab=pl.col("a") * pl.col("b"), - ... not_c=pl.col("c").not_(), - ... ) - shape: (4, 5) - ┌─────┬──────┬───────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ ab ┆ not_c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ - └─────┴──────┴───────┴──────┴───────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... df.drop("c").with_columns( - ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), - ... ) - ... - shape: (4, 3) - ┌─────┬──────┬─────────────┐ - │ a ┆ b ┆ diffs │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ struct[2] │ - ╞═════╪══════╪═════════════╡ - │ 1 ┆ 0.5 ┆ {null,null} │ - │ 2 ┆ 4.0 ┆ {1,3.5} │ - │ 3 ┆ 10.0 ┆ {1,6.0} │ - │ 4 ┆ 13.0 ┆ {1,3.0} │ - └─────┴──────┴─────────────┘ - - ''' - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - """ - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - See Also - -------- - with_columns - - """ - def n_chunks(self, strategy: str = ...) -> int | list[int]: - ''' - Get number of chunks used by the ChunkedArrays of this DataFrame. - - Parameters - ---------- - strategy : {\'first\', \'all\'} - Return the number of chunks of the \'first\' column, - or \'all\' columns in this DataFrame. - - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... 
) - >>> df.n_chunks() - 1 - >>> df.n_chunks(strategy="all") - [1, 1, 1] - - ''' - def max(self, axis: int | None = ...) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their maximum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`max_horizontal`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.max() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def max_horizontal(self) -> Series: - ''' - Get the maximum value horizontally across columns. - - Returns - ------- - Series - A Series named `"max"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.max_horizontal() - shape: (3,) - Series: \'max\' [f64] - [ - 4.0 - 5.0 - 6.0 - ] - ''' - def min(self, axis: int | None = ...) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their minimum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`min_horizontal`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.min() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - ''' - def min_horizontal(self) -> Series: - ''' - Get the minimum value horizontally across columns. - - Returns - ------- - Series - A Series named `"min"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.min_horizontal() - shape: (3,) - Series: \'min\' [f64] - [ - 1.0 - 2.0 - 3.0 - ] - ''' - def sum(self) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their sum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`sum_horizontal`. - null_strategy : {\'ignore\', \'propagate\'} - This argument is only used if `axis == 1`. - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.sum() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 6 ┆ 21 ┆ null │ - └─────┴─────┴──────┘ - ''' - def sum_horizontal(self) -> Series: - ''' - Sum all values horizontally across columns. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - If set to `False`, any null value in the input will lead to a null output. 
- - Returns - ------- - Series - A Series named `"sum"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.sum_horizontal() - shape: (3,) - Series: \'sum\' [f64] - [ - 5.0 - 7.0 - 9.0 - ] - ''' - def mean(self) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their mean value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`mean_horizontal`. - null_strategy : {\'ignore\', \'propagate\'} - This argument is only used if `axis == 1`. - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... "spam": [True, False, None], - ... } - ... ) - >>> df.mean() - shape: (1, 4) - ┌─────┬─────┬──────┬──────┐ - │ foo ┆ bar ┆ ham ┆ spam │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str ┆ f64 │ - ╞═════╪═════╪══════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ - └─────┴─────┴──────┴──────┘ - ''' - def mean_horizontal(self) -> Series: - ''' - Take the mean of all values horizontally across columns. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - If set to `False`, any null value in the input will lead to a null output. - - Returns - ------- - Series - A Series named `"mean"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.mean_horizontal() - shape: (3,) - Series: \'mean\' [f64] - [ - 2.5 - 3.5 - 4.5 - ] - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their standard deviation value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.std() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1.0 ┆ 1.0 ┆ null │ - └─────┴─────┴──────┘ - >>> df.std(ddof=0) - shape: (1, 3) - ┌──────────┬──────────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞══════════╪══════════╪══════╡ - │ 0.816497 ┆ 0.816497 ┆ null │ - └──────────┴──────────┴──────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their variance value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.var() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1.0 ┆ 1.0 ┆ null │ - └─────┴─────┴──────┘ - >>> df.var(ddof=0) - shape: (1, 3) - ┌──────────┬──────────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞══════════╪══════════╪══════╡ - │ 0.666667 ┆ 0.666667 ┆ null │ - └──────────┴──────────┴──────┘ - - ''' - def median(self) -> Self: - ''' - Aggregate the columns of this DataFrame to their median value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.median() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null │ - └─────┴─────┴──────┘ - - ''' - def product(self) -> DataFrame: - ''' - Aggregate the columns of this DataFrame to their product values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [0.5, 4, 10], - ... "c": [True, True, False], - ... } - ... ) - - >>> df.product() - shape: (1, 3) - ┌─────┬──────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ i64 │ - ╞═════╪══════╪═════╡ - │ 6 ┆ 20.0 ┆ 0 │ - └─────┴──────┴─────┘ - - ''' - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.quantile(0.5, "nearest") - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null │ - └─────┴─────┴──────┘ - - ''' - def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Convert categorical variables into dummy/indicator variables. - - Parameters - ---------- - columns - Column name(s) or selector(s) that should be converted to dummy - variables. If set to `None` (default), convert all columns. - separator - Separator/delimiter used when generating column names. - drop_first - Remove the first category from the variables being encoded. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2], - ... "bar": [3, 4], - ... "ham": ["a", "b"], - ... } - ... 
) - >>> df.to_dummies() - shape: (2, 6) - ┌───────┬───────┬───────┬───────┬───────┬───────┐ - │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ - ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ - │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ - │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ - └───────┴───────┴───────┴───────┴───────┴───────┘ - - >>> df.to_dummies(drop_first=True) - shape: (2, 3) - ┌───────┬───────┬───────┐ - │ foo_2 ┆ bar_4 ┆ ham_b │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 │ - ╞═══════╪═══════╪═══════╡ - │ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1 ┆ 1 │ - └───────┴───────┴───────┘ - - >>> import polars.selectors as cs - >>> df.to_dummies(cs.integer(), separator=":") - shape: (2, 5) - ┌───────┬───────┬───────┬───────┬─────┐ - │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ - ╞═══════╪═══════╪═══════╪═══════╪═════╡ - │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ - │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ - └───────┴───────┴───────┴───────┴─────┘ - - >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") - shape: (2, 3) - ┌───────┬───────┬─────┐ - │ foo:2 ┆ bar:4 ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ str │ - ╞═══════╪═══════╪═════╡ - │ 0 ┆ 0 ┆ a │ - │ 1 ┆ 1 ┆ b │ - └───────┴───────┴─────┘ - - ''' - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: - ''' - Drop duplicate rows from this dataframe. - - Parameters - ---------- - subset - Column name(s) or selector(s), to consider when identifying - duplicate rows. If set to `None` (default), use all columns. - keep : {\'first\', \'last\', \'any\', \'none\'} - Which of the duplicate rows to keep. - - * \'any\': Does not give any guarantee of which row is kept. - This allows more optimizations. - * \'none\': Don\'t keep duplicate rows. - * \'first\': Keep first unique row. - * \'last\': Keep last unique row. - maintain_order - Keep the same order as the original DataFrame. This is more expensive to - compute. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - Returns - ------- - DataFrame - DataFrame with unique rows. - - Warnings - -------- - This method will fail if there is a column of type `List` in the DataFrame or - subset. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 1], - ... "bar": ["a", "a", "a", "a"], - ... "ham": ["b", "b", "b", "b"], - ... } - ... ) - >>> df.unique(maintain_order=True) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> df.unique(subset=["bar", "ham"], maintain_order=True) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> df.unique(keep="last", maintain_order=True) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - - ''' - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: - ''' - Return the number of unique rows, or the number of unique row-subsets. - - Parameters - ---------- - subset - One or more columns/expressions that define what to count; - omit to return the count of unique rows. 
- - Notes - ----- - This method operates at the `DataFrame` level; to operate on subsets at the - expression level you can make use of struct-packing instead, for example: - - >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() - - If instead you want to count the number of unique values per-column, you can - also use expression-level syntax to return a new frame containing that result: - - >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) - >>> df_nunique = df.select(pl.all().n_unique()) - - In aggregate context there is also an equivalent method for returning the - unique values per-group: - - >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 1, 2, 3, 4, 5], - ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], - ... "c": [True, True, True, False, True, True], - ... } - ... ) - >>> df.n_unique() - 5 - - Simple columns subset. - - >>> df.n_unique(subset=["b", "c"]) - 4 - - Expression subset. - - >>> df.n_unique( - ... subset=[ - ... (pl.col("a") // 2), - ... (pl.col("c") | (pl.col("b") >= 2)), - ... ], - ... ) - 3 - - ''' - def approx_n_unique(self) -> DataFrame: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> df.approx_n_unique() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_unique(self) -> DataFrame: - """ - Approximate count of unique values. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`DataFrame.approx_n_unique`. - - """ - def rechunk(self) -> Self: - """ - Rechunk the data in this DataFrame to a contiguous allocation. - - This will make sure all subsequent operations have optimal and predictable - performance. - """ - def null_count(self) -> Self: - ''' - Create a new DataFrame that shows the null counts per column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 3], - ... "bar": [6, 7, None], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.null_count() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 1 ┆ 0 │ - └─────┴─────┴─────┘ - - ''' - def sample(self, n: int | Series | None = ...) -> Self: - ''' - Sample from this DataFrame. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - If set to True, the order of the sampled rows will be shuffled. If - set to False (default), the order of the returned rows will be - neither stable nor fully random. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: - ''' - Apply a horizontal reduction on a DataFrame. - - This can be used to effectively determine aggregations on a row level, and can - be applied to any DataType that can be supercasted (casted to a similar parent - type). - - An example of the supercast rules when applying an arithmetic operation on two - DataTypes are for instance: - - - Int8 + Utf8 = Utf8 - - Float32 + Int64 = Float32 - - Float32 + Float64 = Float64 - - Examples - -------- - A horizontal sum operation: - - >>> df = pl.DataFrame( - ... { - ... "a": [2, 1, 3], - ... "b": [1, 2, 3], - ... "c": [1.0, 2.0, 3.0], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 + s2) - shape: (3,) - Series: \'a\' [f64] - [ - 4.0 - 5.0 - 9.0 - ] - - A horizontal minimum operation: - - >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) - >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 1.0 - 3.0 - ] - - A horizontal string concatenation: - - >>> df = pl.DataFrame( - ... { - ... "a": ["foo", "bar", 2], - ... "b": [1, 2, 3], - ... "c": [1.0, 2.0, 3.0], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 + s2) - shape: (3,) - Series: \'a\' [str] - [ - "foo11.0" - "bar22.0" - null - ] - - A horizontal boolean or, similar to a row-wise .any(): - - >>> df = pl.DataFrame( - ... { - ... "a": [False, False, True], - ... "b": [False, True, False], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 | s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - Parameters - ---------- - operation - function that takes two `Series` and returns a `Series`. - - ''' - def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: - ''' - Get the values of a single row, either by index or by predicate. - - Parameters - ---------- - index - Row index. - by_predicate - Select the row according to a given expression/predicate. - named - Return a dictionary instead of a tuple. The dictionary is a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - - Returns - ------- - tuple (default) or dictionary of row values - - Notes - ----- - The `index` and `by_predicate` params are mutually exclusive. Additionally, - to ensure clarity, the `by_predicate` parameter must be supplied by keyword. - - When using `by_predicate` it is an error condition if anything other than - one row is returned; more than one row raises `TooManyRowsReturnedError`, and - zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). - - Warnings - -------- - You should NEVER use this method to iterate over a DataFrame; if you require - row-iteration you should strongly prefer use of `iter_rows()` instead. - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - rows : Materialise all frame data as a list of rows (potentially expensive). - item: Return dataframe element as a scalar. - - Examples - -------- - Specify an index to return the row at the given index as a tuple. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.row(2) - (3, 8, \'c\') - - Specify `named=True` to get a dictionary instead with a mapping of column - names to row values. - - >>> df.row(2, named=True) - {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} - - Use `by_predicate` to return the row that matches the given predicate. - - >>> df.row(by_predicate=(pl.col("ham") == "b")) - (2, 7, \'b\') - - ''' - def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: - ''' - Returns all data in the DataFrame as a list of rows of python-native values. - - Parameters - ---------- - named - Return dictionaries instead of tuples. The dictionaries are a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Warnings - -------- - Row-iteration is not optimal as the underlying data is stored in columnar form; - where possible, prefer export via one of the dedicated export/output methods. - Where possible you should also consider using `iter_rows` instead to avoid - materialising all the data at once. - - Returns - ------- - list of tuples (default) or dictionaries of row values - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - rows_by_key : Materialises frame data as a key-indexed dictionary. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "x": ["a", "b", "b", "a"], - ... "y": [1, 2, 3, 4], - ... "z": [0, 3, 6, 9], - ... } - ... ) - >>> df.rows() - [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] - >>> df.rows(named=True) - [{\'x\': \'a\', \'y\': 1, \'z\': 0}, - {\'x\': \'b\', \'y\': 2, \'z\': 3}, - {\'x\': \'b\', \'y\': 3, \'z\': 6}, - {\'x\': \'a\', \'y\': 4, \'z\': 9}] - - ''' - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: - ''' - Returns DataFrame data as a keyed dictionary of python-native values. - - Note that this method should not be used in place of native operations, due to - the high cost of materialising all frame data out into a dictionary; it should - be used only when you need to move the values out into a Python data structure - or other object that cannot operate directly with Polars/Arrow. - - Parameters - ---------- - key - The column(s) to use as the key for the returned dictionary. If multiple - columns are specified, the key will be a tuple of those values, otherwise - it will be a string. - named - Return dictionary rows instead of tuples, mapping column name to row value. - include_key - Include key values inline with the associated data (by default the key - values are omitted as a memory/performance optimisation, as they can be - reoconstructed from the key). - unique - Indicate that the key is unique; this will result in a 1:1 mapping from - key to a single associated row. Note that if the key is *not* actually - unique the last row with the given key will be returned. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. 
If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - See Also - -------- - rows : Materialise all frame data as a list of rows (potentially expensive). - iter_rows : Row iterator over frame data (does not materialise all rows). - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "w": ["a", "b", "b", "a"], - ... "x": ["q", "q", "q", "k"], - ... "y": [1.0, 2.5, 3.0, 4.5], - ... "z": [9, 8, 7, 6], - ... } - ... ) - - Group rows by the given key column(s): - - >>> df.rows_by_key(key=["w"]) - defaultdict(, - {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], - \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) - - Return the same row groupings as dictionaries: - - >>> df.rows_by_key(key=["w"], named=True) - defaultdict(, - {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, - {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], - \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, - {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) - - Return row groupings, assuming keys are unique: - - >>> df.rows_by_key(key=["z"], unique=True) - {9: (\'a\', \'q\', 1.0), - 8: (\'b\', \'q\', 2.5), - 7: (\'b\', \'q\', 3.0), - 6: (\'a\', \'k\', 4.5)} - - Return row groupings as dictionaries, assuming keys are unique: - - >>> df.rows_by_key(key=["z"], named=True, unique=True) - {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, - 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, - 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, - 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} - - Return dictionary rows grouped by a compound key, including key values: - - >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) - defaultdict(, - {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], - (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, - {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], - (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) - - ''' - def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: - ''' - Returns an iterator over the DataFrame of rows of python-native values. - - Parameters - ---------- - named - Return dictionaries instead of tuples. The dictionaries are a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - buffer_size - Determines the number of rows that are buffered internally while iterating - over the data; you should only modify this in very specific cases where the - default value is determined not to be a good fit to your access pattern, as - the speedup from using the buffer is significant (~2-4x). Setting this - value to zero disables row buffering (not recommended). - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Warnings - -------- - Row iteration is not optimal as the underlying data is stored in columnar form; - where possible, prefer export via one of the dedicated export/output methods - that deals with columnar data. - - Returns - ------- - iterator of tuples (default) or dictionaries (if named) of python row values - - See Also - -------- - rows : Materialises all frame data as a list of rows (potentially expensive). - rows_by_key : Materialises frame data as a key-indexed dictionary. 
- - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> [row[0] for row in df.iter_rows()] - [1, 3, 5] - >>> [row["b"] for row in df.iter_rows(named=True)] - [2, 4, 6] - - ''' - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: - ''' - Returns a non-copying iterator of slices over the underlying DataFrame. - - Parameters - ---------- - n_rows - Determines the number of rows contained in each DataFrame slice. - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... data={ - ... "a": range(17_500), - ... "b": date(2023, 1, 1), - ... "c": "klmnoopqrstuvwxyz", - ... }, - ... schema_overrides={"a": pl.Int32}, - ... ) - >>> for idx, frame in enumerate(df.iter_slices()): - ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") - ... - DataFrame:[0]:10000 - DataFrame:[1]:7500 - - Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and - any supported frame export/conversion types; for example, as RecordBatches: - - >>> for frame in df.iter_slices(n_rows=15_000): - ... record_batch = frame.to_arrow().to_batches()[0] - ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") - ... - a: int32 - b: date32[day] - c: large_string - << 15000 - a: int32 - b: date32[day] - c: large_string - << 2500 - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - partition_by : Split into multiple DataFrames, partitioned by groups. - - ''' - def shrink_to_fit(self) -> Self: - """ - Shrink DataFrame memory usage. - - Shrinks to fit the exact capacity needed to hold the data. - - """ - def gather_every(self, n: int) -> DataFrame: - ''' - Take every nth row in the DataFrame and return as a new DataFrame. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - >>> s.gather_every(2) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 5 │ - │ 3 ┆ 7 │ - └─────┴─────┘ - - ''' - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: - ''' - Hash and combine the rows in this DataFrame. - - The hash value is of type `UInt64`. - - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. - - Notes - ----- - This implementation of :func:`hash_rows` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 3, 4], - ... "ham": ["a", "b", None, "d"], - ... } - ... ) - >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT - shape: (4,) - Series: \'\' [u64] - [ - 10783150408545073287 - 1438741209321515184 - 10047419486152048166 - 2047317070637311557 - ] - - ''' - def interpolate(self) -> DataFrame: - ''' - Interpolate intermediate values. The interpolation method is linear. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 9, 10], - ... "bar": [6, 7, 9, None], - ... "baz": [1, None, None, 9], - ... } - ... 
) - >>> df.interpolate() - shape: (4, 3) - ┌──────┬──────┬──────────┐ - │ foo ┆ bar ┆ baz │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════════╡ - │ 1.0 ┆ 6.0 ┆ 1.0 │ - │ 5.0 ┆ 7.0 ┆ 3.666667 │ - │ 9.0 ┆ 9.0 ┆ 6.333333 │ - │ 10.0 ┆ null ┆ 9.0 │ - └──────┴──────┴──────────┘ - - ''' - def is_empty(self) -> bool: - ''' - Check if the dataframe is empty. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.is_empty() - False - >>> df.filter(pl.col("foo") > 99).is_empty() - True - - ''' - def to_struct(self, name: str) -> Series: - ''' - Convert a `DataFrame` to a `Series` of type `Struct`. - - Parameters - ---------- - name - Name for the struct Series - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4, 5], - ... "b": ["one", "two", "three", "four", "five"], - ... } - ... ) - >>> df.to_struct("nums") - shape: (5,) - Series: \'nums\' [struct[2]] - [ - {1,"one"} - {2,"two"} - {3,"three"} - {4,"four"} - {5,"five"} - ] - - ''' - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Decompose struct columns into separate columns for each of their fields. - - The new columns will be inserted into the dataframe at the location of the - struct column. - - Parameters - ---------- - columns - Name of the struct column(s) that should be unnested. - *more_columns - Additional columns to unnest, specified as positional arguments. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "before": ["foo", "bar"], - ... "t_a": [1, 2], - ... "t_b": ["a", "b"], - ... "t_c": [True, None], - ... "t_d": [[1, 2], [3]], - ... "after": ["baz", "womp"], - ... } - ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") - >>> df - shape: (2, 3) - ┌────────┬─────────────────────┬───────┐ - │ before ┆ t_struct ┆ after │ - │ --- ┆ --- ┆ --- │ - │ str ┆ struct[4] ┆ str │ - ╞════════╪═════════════════════╪═══════╡ - │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ - │ bar ┆ {2,"b",null,[3]} ┆ womp │ - └────────┴─────────────────────┴───────┘ - >>> df.unnest("t_struct") - shape: (2, 6) - ┌────────┬─────┬─────┬──────┬───────────┬───────┐ - │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ - ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ - │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ - │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ - └────────┴─────┴─────┴──────┴───────────┴───────┘ - - ''' - def corr(self, **kwargs: Any) -> DataFrame: - ''' - Return pairwise Pearson product-moment correlation coefficients between columns. - - See numpy `corrcoef` for more information: - https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html - - Notes - ----- - This functionality requires numpy to be installed. - - Parameters - ---------- - **kwargs - Keyword arguments are passed to numpy `corrcoef`. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) - >>> df.corr() - shape: (3, 3) - ┌──────┬──────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════╡ - │ 1.0 ┆ -1.0 ┆ 1.0 │ - │ -1.0 ┆ 1.0 ┆ -1.0 │ - │ 1.0 ┆ -1.0 ┆ 1.0 │ - └──────┴──────┴──────┘ - - ''' - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: - ''' - Take two sorted DataFrames and merge them by the sorted key. - - The output of this operation will also be sorted. 
- It is the callers responsibility that the frames are sorted - by that key otherwise the output will not make sense. - - The schemas of both DataFrames must be equal. - - Parameters - ---------- - other - Other DataFrame that must be merged - key - Key that is sorted. - - Examples - -------- - >>> df0 = pl.DataFrame( - ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} - ... ).sort("age") - >>> df0 - shape: (3, 2) - ┌───────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═════╡ - │ bob ┆ 18 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └───────┴─────┘ - >>> df1 = pl.DataFrame( - ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} - ... ).sort("age") - >>> df1 - shape: (4, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - └────────┴─────┘ - >>> df0.merge_sorted(df1, key="age") - shape: (7, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ bob ┆ 18 │ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └────────┴─────┘ - ''' - def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: - """ - Indicate that one or multiple columns are sorted. - - Parameters - ---------- - column - Columns that are sorted - more_columns - Additional columns that are sorted, specified as positional arguments. - descending - Whether the columns are sorted in descending order. - """ - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: - ''' - Update the values in this `DataFrame` with the values in `other`. - - By default, null values in the right dataframe are ignored. Use - `ignore_nulls=False` to overwrite values in this frame with null values in other - frame. - - Notes - ----- - This is syntactic sugar for a left/inner join, with an optional coalesce when - `include_nulls = False`. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Parameters - ---------- - other - DataFrame that will be used to update the values - on - Column names that will be joined on. - If none given the row count is used. - left_on - Join column(s) of the left DataFrame. - right_on - Join column(s) of the right DataFrame. - how : {\'left\', \'inner\', \'outer\'} - * \'left\' will keep all rows from the left table; rows may be duplicated - if multiple rows in the right frame match the left row\'s key. - * \'inner\' keeps only those rows where the key exists in both frames. - * \'outer\' will update existing rows where the key matches while also - adding any new rows contained in the given frame. - include_nulls - If True, null values from the right dataframe will be used to update the - left dataframe. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4], - ... "B": [400, 500, 600, 700], - ... } - ... ) - >>> df - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 400 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - >>> new_df = pl.DataFrame( - ... { - ... "B": [-66, None, -99], - ... "C": [5, 3, 1], - ... } - ... 
) - - Update `df` values with the non-null values in `new_df`, by row index: - - >>> df.update(new_df) - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, by row index, - but only keeping those rows that are common to both frames: - - >>> df.update(new_df, how="inner") - shape: (3, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") - shape: (5, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴─────┘ - - Update `df` values including null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> df.update( - ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ null │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴──────┘ - - ''' - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: - """ - Start a group by operation. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.group_by`. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - .. note:: - Within each group, the order of rows is always preserved, regardless - of this argument. - - Returns - ------- - GroupBy - Object which can be used to perform aggregations. - - """ - def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. 
- Doing so incorrectly will lead to incorrect output - - """ - def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.9 - This method has been renamed to :func:`DataFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - """ - def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.group_by_dynamic`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - include_boundaries - Add the lower and upper bound of the window to the "_lower_bound" and - "_upper_bound" columns. This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. 
- check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - DynamicGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - ''' - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: - """ - Apply a custom/user-defined function (UDF) over the rows of the DataFrame. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.map_rows`. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output type of the operation. If none given, Polars tries to infer the type. - inference_size - Only used in the case when the custom function returns rows. - This uses the first `n` rows to determine the output schema - - """ - def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - fill None values with this value. - n - Number of places to shift (may be negative). - - """ - def take_every(self, n: int) -> DataFrame: - """ - Take every nth row in the DataFrame and return as a new DataFrame. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def find_idx_by_name(self, name: str) -> int: - """ - Find the index of a column by name. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`get_column_index`. - - Parameters - ---------- - name - Name of the column to find. - """ - def insert_at_idx(self, index: int, column: Series) -> Self: - """ - Insert a Series at a certain column index. This operation is in place. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`insert_column`. - - Parameters - ---------- - index - Column to insert the new `Series` column. - column - `Series` to insert. - """ - def replace_at_idx(self, index: int, new_column: Series) -> Self: - """ - Replace a column at an index location. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`replace_column`. - - Parameters - ---------- - index - Column index. - new_column - Series that will replace the column. - """ - def frame_equal(self, other: DataFrame) -> bool: - """ - Check whether the DataFrame is equal to another DataFrame. - - .. deprecated:: 0.19.16 - This method has been renamed to :func:`equals`. - - Parameters - ---------- - other - DataFrame to compare with. - null_equal - Consider null values as equal. - """ - @property - def shape(self): ... - @property - def height(self): ... - @property - def width(self): ... - @property - def dtypes(self): ... - @property - def flags(self): ... - @property - def schema(self): ... -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
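The deleted stub content above gives way to regenerated stubs kept under per-version directories; each new file opens with a `#: version <polars version>` marker line (see the `+#: version 0.20.1` line at the top of the next hunk). As a minimal sketch of how such a marker could be read back into a comparable version object, assuming an illustrative helper name (`read_stub_version` is not part of this PR) and using the `packaging` dependency:

    from pathlib import Path
    from packaging import version

    def read_stub_version(stub_path: Path) -> version.Version:
        # Generated stubs begin with a marker line such as "#: version 0.20.1";
        # parse that first line back into a Version object for comparisons.
        first_line = stub_path.read_text().splitlines()[0]
        prefix = "#: version "
        if not first_line.startswith(prefix):
            raise ValueError(f"{stub_path} has no version marker")
        return version.parse(first_line[len(prefix):].strip())

    # Illustrative usage against the file added below:
    # read_stub_version(Path("src/polugins_type_gen/_stubs/0.20.1/polars/dataframe/frame.pyi"))
    # would yield Version('0.20.1').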
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/dataframe/frame.pyi new file mode 100644 index 0000000..21a96e5 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/dataframe/frame.pyi @@ -0,0 +1,7092 @@ +#: version 0.20.1 +import P +import deltalake +import deltalake.table +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Enum as Enum, Float64 as Float64, Null as Null, Object as Object, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import ModuleUpgradeRequired as ModuleUpgradeRequired, NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, _warn_null_comparison as _warn_null_comparison, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, handle_projection_columns as 
handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PANDAS_AVAILABLE: bool +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. 
+ infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. 
The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use `pl.read_csv` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use `pl.read_parquet` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading `n_rows`. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use `pl.read_json` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. 
+ + Use `pl.read_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with `NaN`. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to `True` will raise a `NotImplementedError`. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars DataFrame to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... 
+ def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the DataFrame as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to `df[0,0]`, with a check that + the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are Series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + `structured` is set to `False` and the DataFrame dtypes allow for a + global dtype for all columns. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + function for the conversion to numpy if necessary. + + Notes + ----- + If you\'re attempting to convert Utf8 or Decimal to an array, you\'ll need to + install `pyarrow`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. 
, \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Utf8), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Utf8), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. 
This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path or writeable file-like object to which the data will be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + name + Schema name. Defaults to empty string. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open `xlsxwriter.Workbook` object that has not been closed. + If None, writes to a `dataframe.xlsx` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of `{"key":value,}` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. + column_formats : dict + A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. + dtype_formats : dict + A `{dtype:str,}` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + `column_formats` param). It is also valid to use dtype groups such as + `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid `xlsxwriter` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all `xlsxwriter` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. 
+ header_format : dict + A `{key:value,}` dictionary of `xlsxwriter` format options to apply + to the table header row, such as `{"bold":True, "font_color":"#702963"}`. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a `{colname:funcname,}` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A `{colname:int,}` or `{selector:int,}` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a `{colname:columns,}` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or `{row_index:int,}` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that `row_index` starts at zero and will be + the header row (unless `include_header` is False). + sparklines : dict + A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an `xlsxwriter`-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. 
+ float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + include_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible `xlsxwriter` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic DataFrame: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... 
) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... 
# create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC data will be + written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC record batch data will + be written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. 
+ compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + data_page_size + Size of the data page in bytes. Defaults to 1024^2 bytes. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to `pyarrow.parquet.write_table`. + + If you pass `partition_cols` here, the dataset will be written + using `pyarrow.parquet.write_to_dataset`. + The `partition_cols` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, table_name: str, connection: str) -> int: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Schema-qualified name of the table to create or append to in the target + SQL database. If your table name contains special characters, it should + be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_table_exists : {\'append\', \'replace\', \'fail\'} + The insert mode: + + * \'replace\' will create a new database table, overwriting an existing one. + * \'append\' will append to an existing table. + * \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine to use for writing frame data. + + Returns + ------- + int + The number of rows affected, if the driver provides this information. + Otherwise, returns -1. + + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> deltalake.table.TableMerger | None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\', \'merge\'} + How to handle existing data. + + - If \'error\', throw an error if the table already exists (default). + - If \'append\', will add new data. + - If \'overwrite\', will replace table with new data. + - If \'ignore\', will not write anything if table already exists. + - If \'merge\', return a `TableMerger` object to merge data from the DataFrame + with the existing data. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. 
+ + - See a list of supported storage options for S3 `here `__. + - See a list of supported storage options for GCS `here `__. + - See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + delta_merge_options + Keyword arguments which are required to `MERGE` a Delta lake Table. + See a list of supported merge options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + TableNotFoundError + If the delta table doesn\'t exist and MERGE action is triggered + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. + + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a DataFrame as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + Merge the DataFrame with an existing Delta Lake table. + For all `TableMerger` methods, check the deltalake docs + `here `__. + + Schema evolution is not yet supported in by the `deltalake` package, therefore + `overwrite_schema` will not have any effect on a merge operation. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> ( + ... df.write_delta( + ... "table_path", + ... mode="merge", + ... delta_merge_options={ + ... "predicate": "s.foo = t.foo", + ... "source_alias": "s", + ... "target_alias": "t", + ... }, + ... ) + ... .when_matched_update_all() + ... .when_not_matched_insert_all() + ... .execute() + ... 
) # doctest: +SKIP + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... 
+ >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_column(self, index: int, column: Series) -> Self: + ''' + Insert a Series at a certain column index. + + This operation is in place. + + Parameters + ---------- + index + Index at which to insert the new `Series` column. + column + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_column(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_column(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... ) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Warnings + -------- + We will never guarantee the output of describe to be stable. + It will show statistics that we deem informative and may + be updated in the future. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "float": [1.0, 2.8, 3.0], + ... "int": [4, 5, None], + ... "bool": [True, False, True], + ... "str": [None, "b", "c"], + ... "str2": ["usd", "eur", None], + ... "date": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬───────┬──────┬──────┬────────────┐ + │ describe ┆ float ┆ int ┆ bool ┆ str ┆ str2 ┆ date │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ str ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪═══════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3 ┆ 2 ┆ 2 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ False ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 2.8 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ True ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴───────┴──────┴──────┴────────────┘ + + ''' + def get_column_index(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.get_column_index("ham") + 2 + + ''' + def replace_column(self, index: int, column: Series) -> Self: + ''' + Replace a column at an index location. + + This operation is in place. + + Parameters + ---------- + index + Column index. + column + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_column(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def equals(self, other: DataFrame) -> bool: + ''' + Check whether the DataFrame is equal to another DataFrame. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + See Also + -------- + assert_frame_equal + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... 
"bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.equals(df1) + True + >>> df1.equals(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. 
+ + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... 
+ >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. 
+ + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The `GroupBy` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. 
If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... 
"values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\', \'outer_coalesce\'} + Join strategy. + + * *inner* + Returns rows that have matching values in both tables + * *left* + Returns all rows from the left table, and the matched rows from the + right table + * *outer* + Returns all rows when there is a match in either left or right table + * *outer_coalesce* + Same as \'outer\', but coalesces the key columns + * *cross* + Returns the cartisian product of rows from both tables + * *semi* + Filter rows that have a match in the right table. + * *anti* + Filter rows that not have a match in the right table. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + join_nulls + Join on null values. By default null values will never produce matches. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 5) + ┌──────┬──────┬──────┬───────┬───────────┐ + │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞══════╪══════╪══════╪═══════╪═══════════╡ + │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │ + │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │ + │ null ┆ null ┆ null ┆ z ┆ d │ + │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │ + └──────┴──────┴──────┴───────┴───────────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see `pl.StringCache()`. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: `udf(row)`. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level `apply` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level `apply` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
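+
+        For instance (an illustrative sketch; `expensive` stands in for your own
+        costly per-row function and `df` for your frame):
+
+        .. code-block:: python
+
+            from functools import lru_cache
+
+            @lru_cache(maxsize=None)
+            def expensive(value: int) -> int:
+                return value * 2  # stand-in for a costly computation
+
+            df.map_rows(lambda row: expensive(row[0]))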
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of + this `DataFrame`, `extend` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer `vstack` over `extend` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single `DataFrame`. In the latter case, finish the sequence of + `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill `value`. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to `None` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. 
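+
+        Notes
+        -----
+        A rough mental model (an illustrative sketch, not from the upstream
+        docstring): with `how="vertical"` each column is cut into consecutive
+        chunks of `step` rows and every chunk becomes its own output column,
+        so a frame of height `step * k` yields `k` output columns per input
+        column.
+
+        >>> pl.DataFrame({"a": [1, 2, 3, 4]}).unstack(step=2, how="vertical")  # doctest: +IGNORE_RESULT
+        shape: (2, 2)
+        ┌─────┬─────┐
+        │ a_0 ┆ a_1 │
+        │ --- ┆ --- │
+        │ i64 ┆ i64 │
+        ╞═════╪═════╡
+        │ 1   ┆ 3   │
+        │ 2   ┆ 4   │
+        └─────┴─────┘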
+ + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying `as_dict=True`. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. 
+ + >>> df.shift(-2, fill_value=100) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`max_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def max_horizontal(self) -> Series: + ''' + Get the maximum value horizontally across columns. + + Returns + ------- + Series + A Series named `"max"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.max_horizontal() + shape: (3,) + Series: \'max\' [f64] + [ + 4.0 + 5.0 + 6.0 + ] + ''' + def min(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`min_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def min_horizontal(self) -> Series: + ''' + Get the minimum value horizontally across columns. + + Returns + ------- + Series + A Series named `"min"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.min_horizontal() + shape: (3,) + Series: \'min\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`sum_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + ''' + def sum_horizontal(self) -> Series: + ''' + Sum all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. 
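+
+            As an illustrative sketch (not from the upstream docstring):
+            `pl.DataFrame({"a": [1, None]}).sum_horizontal()` gives `[1, 0]`
+            because the null is ignored, while
+            `sum_horizontal(ignore_nulls=False)` gives `[1, null]`.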
+ + Returns + ------- + Series + A Series named `"sum"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.sum_horizontal() + shape: (3,) + Series: \'sum\' [f64] + [ + 5.0 + 7.0 + 9.0 + ] + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`mean_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + ''' + def mean_horizontal(self) -> Series: + ''' + Take the mean of all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"mean"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.mean_horizontal() + shape: (3,) + Series: \'mean\' [f64] + [ + 2.5 + 3.5 + 4.5 + ] + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to `None` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
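+
+        Returns
+        -------
+        int
+            The number of unique rows, or the number of unique row-subsets.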
+ + Notes + ----- + This method operates at the `DataFrame` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. 
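+
+        Roughly equivalent to a left-to-right reduce over the frame\'s columns
+        (an illustrative sketch, not from the upstream docstring):
+
+        >>> from functools import reduce
+        >>> def fold_sketch(df, operation):
+        ...     return reduce(operation, df.get_columns())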
+ + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + - Int8 + Utf8 = Utf8 + - Float32 + Int64 = Float32 + - Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The `index` and `by_predicate` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using `by_predicate` it is an error condition if anything other than + one row is returned; more than one row raises `TooManyRowsReturnedError`, and + zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of `iter_rows()` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify `named=True` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use `by_predicate` to return the row that matches the given predicate. 
+ + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using `iter_rows` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_columns(self) -> Iterator[Series]: + ''' + Returns an iterator over the DataFrame\'s columns. 
+ + Notes + ----- + Consider whether you can use :func:`all` instead. + If you can, it will be more efficient. + + Returns + ------- + Iterator of Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [s.name for s in df.iter_columns()] + [\'a\', \'b\'] + + If you\'re using this to modify a dataframe\'s columns, e.g. + + >>> # Do NOT do this + >>> pl.DataFrame(column * 2 for column in df.iter_columns()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + + then consider whether you can use :func:`all` instead: + + >>> df.select(pl.all() * 2) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def gather_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.gather_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash_rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... 
) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str = ...) -> Series: + ''' + Convert a `DataFrame` to a `Series` of type `Struct`. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy `corrcoef` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy `corrcoef`. 
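+
+        Returns
+        -------
+        DataFrame
+            The pairwise correlation matrix.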
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the values in `other`. + + .. warning:: + This functionality is experimental and may change without it being + considered a breaking change. + + By default, null values in the right frame are ignored. Use + `include_nulls=False` to overwrite values in this frame with + null values in the other frame. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce + when `include_nulls = False` + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. 
+ * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> DataFrame: + """ + Take every nth row in the DataFrame and return as a new DataFrame. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def find_idx_by_name(self, name: str) -> int: + """ + Find the index of a column by name. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`get_column_index`. + + Parameters + ---------- + name + Name of the column to find. + """ + def insert_at_idx(self, index: int, column: Series) -> Self: + """ + Insert a Series at a certain column index. This operation is in place. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`insert_column`. + + Parameters + ---------- + index + Column to insert the new `Series` column. + column + `Series` to insert. + """ + def replace_at_idx(self, index: int, new_column: Series) -> Self: + """ + Replace a column at an index location. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`replace_column`. + + Parameters + ---------- + index + Column index. + new_column + Series that will replace the column. + """ + def frame_equal(self, other: DataFrame) -> bool: + """ + Check whether the DataFrame is equal to another DataFrame. + + .. deprecated:: 0.19.16 + This method has been renamed to :func:`equals`. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... 
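Aside (not part of the generated stub): the added DataFrame stub above keeps the deprecated methods (`groupby`, `apply`, `shift_and_fill`, `take_every`, `find_idx_by_name`, `insert_at_idx`, `replace_at_idx`, `frame_equal`) alongside their documented replacements, so code written against the old names continues to type-check. A minimal illustrative sketch of those renames, assuming a polars release (>= 0.19.16) in which all of the new names exist:

import polars as pl

df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})

df.group_by("a").agg(pl.col("b").sum())             # was: df.groupby(...)
df.map_rows(lambda row: (row[0] + row[1],))         # was: df.apply(...)
df.shift(1, fill_value=0)                           # was: df.shift_and_fill(0, n=1)
df.gather_every(2)                                  # was: df.take_every(2)
df.get_column_index("b")                            # was: df.find_idx_by_name("b")
df.insert_column(2, pl.Series("c", [0, 0, 0, 0]))   # was: df.insert_at_idx(2, ...)
df.replace_column(1, pl.Series("b", [9, 9, 9, 9]))  # was: df.replace_at_idx(1, ...)
df.equals(df)                                       # was: df.frame_equal(df)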
+def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/expr/expr deleted file mode 100644 index 5131d44..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/expr/expr +++ /dev/null @@ -1,8289 +0,0 @@ -import P -import np as np -import pl -from builtins import PyExpr -from datetime import timedelta -from polars.datatypes.classes import Categorical as Categorical, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 -from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy -from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import no_default as no_default, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence - -TYPE_CHECKING: bool -py_arg_where: builtin_function_or_method -pyreduce: builtin_function_or_method - -class Expr: - _pyexpr: _ClassVar[None] = ... - _accessors: _ClassVar[set] = ... - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _repr_html_(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int | bool) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... 
- def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int | bool) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr | int | bool) -> Self: ... - def __rxor__(self, other: Any) -> Self: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: - """Numpy universal functions.""" - @classmethod - def from_json(cls, value: str) -> Self: - """ - Read an expression from a JSON encoded string to construct an Expression. - - Parameters - ---------- - value - JSON encoded string value - - """ - def to_physical(self) -> Self: - ''' - Cast to physical representation of the logical dtype. - - - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` - - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` - - `List(inner)` -> `List(physical of inner)` - - Other data types will be left unchanged. - - Examples - -------- - Replicating the pandas - `pd.factorize - `_ - function. - - >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( - ... [ - ... pl.col("vals").cast(pl.Categorical), - ... pl.col("vals") - ... .cast(pl.Categorical) - ... .to_physical() - ... .alias("vals_physical"), - ... ] - ... ) - shape: (4, 2) - ┌──────┬───────────────┐ - │ vals ┆ vals_physical │ - │ --- ┆ --- │ - │ cat ┆ u32 │ - ╞══════╪═══════════════╡ - │ a ┆ 0 │ - │ x ┆ 1 │ - │ null ┆ null │ - │ a ┆ 0 │ - └──────┴───────────────┘ - - ''' - def any(self) -> Self: - ''' - Return whether any of the values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is null. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, False], - ... "b": [False, False], - ... "c": [None, False], - ... } - ... ) - >>> df.select(pl.col("*").any()) - shape: (1, 3) - ┌──────┬───────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪═══════╡ - │ true ┆ false ┆ false │ - └──────┴───────┴───────┘ - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> df.select(pl.col("*").any(ignore_nulls=False)) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ null │ - └──────┴───────┴──────┘ - - ''' - def all(self) -> Self: - ''' - Return whether all values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - .. 
note:: - This method is not to be confused with the function :func:`polars.all`, - which can be used to select all columns. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is null. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, True], - ... "b": [False, True], - ... "c": [None, True], - ... } - ... ) - >>> df.select(pl.col("*").all()) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ true │ - └──────┴───────┴──────┘ - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> df.select(pl.col("*").all(ignore_nulls=False)) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ null │ - └──────┴───────┴──────┘ - - ''' - def arg_true(self) -> Self: - ''' - Return indices where expression evaluates `True`. - - .. warning:: - Modifies number of rows returned, so will fail in combination with other - expressions. Use as only expression in `select` / `with_columns`. - - See Also - -------- - Series.arg_true : Return indices where Series is True - polars.arg_where - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) - >>> df.select((pl.col("a") == 1).arg_true()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - │ 3 │ - └─────┘ - - ''' - def sqrt(self) -> Self: - ''' - Compute the square root of the elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").sqrt()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.414214 │ - │ 2.0 │ - └──────────┘ - - ''' - def cbrt(self) -> Self: - ''' - Compute the cube root of the elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").cbrt()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.259921 │ - │ 1.587401 │ - └──────────┘ - - ''' - def log10(self) -> Self: - ''' - Compute the base 10 logarithm of the input array, element-wise. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").log10()) - shape: (3, 1) - ┌─────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞═════════╡ - │ 0.0 │ - │ 0.30103 │ - │ 0.60206 │ - └─────────┘ - - ''' - def exp(self) -> Self: - ''' - Compute the exponential, element-wise. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").exp()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 2.718282 │ - │ 7.389056 │ - │ 54.59815 │ - └──────────┘ - - ''' - def alias(self, name: str) -> Self: - ''' - Rename the expression. - - Parameters - ---------- - name - The new name. - - See Also - -------- - map - prefix - suffix - - Examples - -------- - Rename an expression to avoid overwriting an existing column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... ) - >>> df.with_columns( - ... pl.col("a") + 10, - ... pl.col("b").str.to_uppercase().alias("c"), - ... 
) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 11 ┆ x ┆ X │ - │ 12 ┆ y ┆ Y │ - │ 13 ┆ z ┆ Z │ - └─────┴─────┴─────┘ - - Overwrite the default name of literal columns to prevent errors due to duplicate - column names. - - >>> df.with_columns( - ... pl.lit(True).alias("c"), - ... pl.lit(4.0).alias("d"), - ... ) - shape: (3, 4) - ┌─────┬─────┬──────┬─────┐ - │ a ┆ b ┆ c ┆ d │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ bool ┆ f64 │ - ╞═════╪═════╪══════╪═════╡ - │ 1 ┆ x ┆ true ┆ 4.0 │ - │ 2 ┆ y ┆ true ┆ 4.0 │ - │ 3 ┆ z ┆ true ┆ 4.0 │ - └─────┴─────┴──────┴─────┘ - - ''' - def map_alias(self, function: Callable[[str], str]) -> Self: - ''' - Rename the output of an expression by mapping a function over the root name. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.map`. - - Parameters - ---------- - function - Function that maps a root name to a new name. - - See Also - -------- - keep_name - prefix - suffix - - Examples - -------- - Remove a common suffix and convert to lower case. - - >>> df = pl.DataFrame( - ... { - ... "A_reverse": [3, 2, 1], - ... "B_reverse": ["z", "y", "x"], - ... } - ... ) - >>> df.with_columns( - ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) - ... ) - shape: (3, 4) - ┌───────────┬───────────┬─────┬─────┐ - │ A_reverse ┆ B_reverse ┆ a ┆ b │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═══════════╪═══════════╪═════╪═════╡ - │ 3 ┆ z ┆ 1 ┆ x │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 1 ┆ x ┆ 3 ┆ z │ - └───────────┴───────────┴─────┴─────┘ - - ''' - def prefix(self, prefix: str) -> Self: - ''' - Add a prefix to the root column name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.prefix`. - - Parameters - ---------- - prefix - Prefix to add to the root column name. - - Notes - ----- - This will undo any previous renaming operations on the expression. - - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - suffix - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... ) - >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) - shape: (3, 4) - ┌─────┬─────┬───────────┬───────────┐ - │ a ┆ b ┆ reverse_a ┆ reverse_b │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪═════╪═══════════╪═══════════╡ - │ 1 ┆ x ┆ 3 ┆ z │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 3 ┆ z ┆ 1 ┆ x │ - └─────┴─────┴───────────┴───────────┘ - - ''' - def suffix(self, suffix: str) -> Self: - ''' - Add a suffix to the root column name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.suffix`. - - Parameters - ---------- - suffix - Suffix to add to the root column name. - - Notes - ----- - This will undo any previous renaming operations on the expression. - - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - prefix - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... 
) - >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) - shape: (3, 4) - ┌─────┬─────┬───────────┬───────────┐ - │ a ┆ b ┆ a_reverse ┆ b_reverse │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪═════╪═══════════╪═══════════╡ - │ 1 ┆ x ┆ 3 ┆ z │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 3 ┆ z ┆ 1 ┆ x │ - └─────┴─────┴───────────┴───────────┘ - - ''' - def keep_name(self) -> Self: - ''' - Keep the original root name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.keep`. - - Notes - ----- - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - alias - - Examples - -------- - Undo an alias operation. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2], - ... "b": [3, 4], - ... } - ... ) - >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 9 ┆ 3 │ - │ 18 ┆ 4 │ - └─────┴─────┘ - - Prevent errors due to duplicate column names. - - >>> df.select((pl.lit(10) / pl.all()).name.keep()) - shape: (2, 2) - ┌──────┬──────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════════╡ - │ 10.0 ┆ 3.333333 │ - │ 5.0 ┆ 2.5 │ - └──────┴──────────┘ - - ''' - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: - ''' - Exclude columns from a multi-column expression. - - Only works after a wildcard or regex column selection, and you cannot provide - both string column names *and* dtypes (you may prefer to use selectors instead). - - Parameters - ---------- - columns - The name or datatype of the column(s) to exclude. Accepts regular expression - input. Regular expressions should start with `^` and end with `$`. - *more_columns - Additional names or datatypes of columns to exclude, specified as positional - arguments. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "aa": [1, 2, 3], - ... "ba": ["a", "b", None], - ... "cc": [None, 2.5, 1.5], - ... } - ... ) - >>> df - shape: (3, 3) - ┌─────┬──────┬──────┐ - │ aa ┆ ba ┆ cc │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ f64 │ - ╞═════╪══════╪══════╡ - │ 1 ┆ a ┆ null │ - │ 2 ┆ b ┆ 2.5 │ - │ 3 ┆ null ┆ 1.5 │ - └─────┴──────┴──────┘ - - Exclude by column name(s): - - >>> df.select(pl.all().exclude("ba")) - shape: (3, 2) - ┌─────┬──────┐ - │ aa ┆ cc │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ null │ - │ 2 ┆ 2.5 │ - │ 3 ┆ 1.5 │ - └─────┴──────┘ - - Exclude by regex, e.g. removing all columns whose names end with the letter "a": - - >>> df.select(pl.all().exclude("^.*a$")) - shape: (3, 1) - ┌──────┐ - │ cc │ - │ --- │ - │ f64 │ - ╞══════╡ - │ null │ - │ 2.5 │ - │ 1.5 │ - └──────┘ - - Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: - - >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) - shape: (3, 1) - ┌──────┐ - │ ba │ - │ --- │ - │ str │ - ╞══════╡ - │ a │ - │ b │ - │ null │ - └──────┘ - - ''' - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the expression as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Examples - -------- - >>> def extract_number(expr: pl.Expr) -> pl.Expr: - ... 
"""Extract the digits from a string.""" - ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) - >>> - >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: - ... """Set even numbers negative, and scale by a user-supplied value.""" - ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) - ... return expr * n - >>> - >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) - >>> df.with_columns( - ... udfs=( - ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) - ... ), - ... ) - shape: (4, 2) - ┌──────┬──────┐ - │ val ┆ udfs │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞══════╪══════╡ - │ a: 1 ┆ 5 │ - │ b: 2 ┆ -10 │ - │ c: 3 ┆ 15 │ - │ d: 4 ┆ -20 │ - └──────┴──────┘ - - ''' - def is_not(self) -> Self: - """ - Negate a boolean expression. - - .. deprecated:: 0.19.2 - This method has been renamed to :func:`Expr.not_`. - - """ - def not_(self) -> Self: - ''' - Negate a boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, False, False], - ... "b": ["a", "b", None], - ... } - ... ) - >>> df - shape: (3, 2) - ┌───────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ bool ┆ str │ - ╞═══════╪══════╡ - │ true ┆ a │ - │ false ┆ b │ - │ false ┆ null │ - └───────┴──────┘ - >>> df.select(pl.col("a").not_()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ true │ - └───────┘ - - ''' - def is_null(self) -> Self: - ''' - Returns a boolean Series indicating which values are null. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null - shape: (5, 4) - ┌──────┬─────┬──────────┬──────────┐ - │ a ┆ b ┆ a_isnull ┆ b_isnull │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪═════╪══════════╪══════════╡ - │ 1 ┆ 1.0 ┆ false ┆ false │ - │ 2 ┆ 2.0 ┆ false ┆ false │ - │ null ┆ NaN ┆ true ┆ false │ - │ 1 ┆ 1.0 ┆ false ┆ false │ - │ 5 ┆ 5.0 ┆ false ┆ false │ - └──────┴─────┴──────────┴──────────┘ - - ''' - def is_not_null(self) -> Self: - ''' - Returns a boolean Series indicating which values are not null. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns( - ... pl.all().is_not_null().name.suffix("_not_null") # nan != null - ... ) - shape: (5, 4) - ┌──────┬─────┬────────────┬────────────┐ - │ a ┆ b ┆ a_not_null ┆ b_not_null │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪═════╪════════════╪════════════╡ - │ 1 ┆ 1.0 ┆ true ┆ true │ - │ 2 ┆ 2.0 ┆ true ┆ true │ - │ null ┆ NaN ┆ false ┆ true │ - │ 1 ┆ 1.0 ┆ true ┆ true │ - │ 5 ┆ 5.0 ┆ true ┆ true │ - └──────┴─────┴────────────┴────────────┘ - - ''' - def is_finite(self) -> Self: - ''' - Returns a boolean Series indicating which values are finite. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1.0, 2], - ... "B": [3.0, float("inf")], - ... } - ... ) - >>> df.select(pl.all().is_finite()) - shape: (2, 2) - ┌──────┬───────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞══════╪═══════╡ - │ true ┆ true │ - │ true ┆ false │ - └──────┴───────┘ - - ''' - def is_infinite(self) -> Self: - ''' - Returns a boolean Series indicating which values are infinite. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. 
- - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1.0, 2], - ... "B": [3.0, float("inf")], - ... } - ... ) - >>> df.select(pl.all().is_infinite()) - shape: (2, 2) - ┌───────┬───────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞═══════╪═══════╡ - │ false ┆ false │ - │ false ┆ true │ - └───────┴───────┘ - - ''' - def is_nan(self) -> Self: - ''' - Returns a boolean Series indicating which values are NaN. - - Notes - ----- - Floating point `NaN` (Not A Number) should not be confused - with missing data represented as `Null/None`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) - shape: (5, 3) - ┌──────┬─────┬─────────┐ - │ a ┆ b ┆ b_isnan │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪═════╪═════════╡ - │ 1 ┆ 1.0 ┆ false │ - │ 2 ┆ 2.0 ┆ false │ - │ null ┆ NaN ┆ true │ - │ 1 ┆ 1.0 ┆ false │ - │ 5 ┆ 5.0 ┆ false │ - └──────┴─────┴─────────┘ - - ''' - def is_not_nan(self) -> Self: - ''' - Returns a boolean Series indicating which values are not NaN. - - Notes - ----- - Floating point `NaN` (Not A Number) should not be confused - with missing data represented as `Null/None`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) - shape: (5, 3) - ┌──────┬─────┬──────────────┐ - │ a ┆ b ┆ b_is_not_nan │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪═════╪══════════════╡ - │ 1 ┆ 1.0 ┆ true │ - │ 2 ┆ 2.0 ┆ true │ - │ null ┆ NaN ┆ false │ - │ 1 ┆ 1.0 ┆ true │ - │ 5 ┆ 5.0 ┆ true │ - └──────┴─────┴──────────────┘ - - ''' - def agg_groups(self) -> Self: - ''' - Get the group indexes of the group by operation. - - Should be used in aggregation context only. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... "one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [94, 95, 96, 97, 97, 99], - ... } - ... ) - >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[u32] │ - ╞═══════╪═══════════╡ - │ one ┆ [0, 1, 2] │ - │ two ┆ [3, 4, 5] │ - └───────┴───────────┘ - - ''' - def count(self) -> Self: - ''' - Return the number of elements in the column. - - .. warning:: - Null values are treated like regular elements in this context. - - Examples - -------- - >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) - >>> df.select(pl.all().count()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 3 ┆ 3 │ - └─────┴─────┘ - - ''' - def len(self) -> Self: - ''' - Return the number of elements in the column. - - Null values are treated like regular elements in this context. - - Alias for :func:`count`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) - >>> df.select(pl.all().len()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 3 ┆ 3 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: - ''' - Get a slice of this expression. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. 
If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10, 11], - ... "b": [None, 4, 4, 4], - ... } - ... ) - >>> df.select(pl.all().slice(1, 2)) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 9 ┆ 4 │ - │ 10 ┆ 4 │ - └─────┴─────┘ - - ''' - def append(self, other: IntoExpr) -> Self: - ''' - Append expressions. - - This is done by adding the chunks of `other` to this `Series`. - - Parameters - ---------- - other - Expression to append. - upcast - Cast both `Series` to the same supertype. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10], - ... "b": [None, 4, 4], - ... } - ... ) - >>> df.select(pl.all().head(1).append(pl.all().tail(1))) - shape: (2, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 8 ┆ null │ - │ 10 ┆ 4 │ - └─────┴──────┘ - - ''' - def rechunk(self) -> Self: - ''' - Create a single chunk of memory for this Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - - Create a Series with 3 nulls, append column a then rechunk - - >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) - shape: (6, 1) - ┌────────┐ - │ repeat │ - │ --- │ - │ i64 │ - ╞════════╡ - │ null │ - │ null │ - │ null │ - │ 1 │ - │ 1 │ - │ 2 │ - └────────┘ - - ''' - def drop_nulls(self) -> Self: - ''' - Drop all null values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nans - - Notes - ----- - A null value is not the same as a NaN value. - To drop NaN values, use :func:`drop_nans`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) - >>> df.select(pl.col("a").drop_nulls()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 3.0 │ - │ NaN │ - └─────┘ - - ''' - def drop_nans(self) -> Self: - ''' - Drop all floating point NaN values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nulls - - Notes - ----- - A NaN value is not the same as a null value. - To drop null values, use :func:`drop_nulls`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) - >>> df.select(pl.col("a").drop_nans()) - shape: (3, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 1.0 │ - │ null │ - │ 3.0 │ - └──────┘ - - ''' - def cum_sum(self) -> Self: - ''' - Get an array with the cumulative sum computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_sum().alias("cum_sum"), - ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_sum ┆ cum_sum_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 10 │ - │ 2 ┆ 3 ┆ 9 │ - │ 3 ┆ 6 ┆ 7 │ - │ 4 ┆ 10 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - Null values are excluded, but can also be filled by calling `forward_fill`. - - >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) - >>> df.with_columns( - ... pl.col("values").cum_sum().alias("value_cum_sum"), - ... pl.col("values") - ... .cum_sum() - ... .forward_fill() - ... 
.alias("value_cum_sum_all_filled"), - ... ) - shape: (8, 3) - ┌────────┬───────────────┬──────────────────────────┐ - │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞════════╪═══════════════╪══════════════════════════╡ - │ null ┆ null ┆ null │ - │ 10 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 8 ┆ 18 ┆ 18 │ - │ 9 ┆ 27 ┆ 27 │ - │ null ┆ null ┆ 27 │ - │ 16 ┆ 43 ┆ 43 │ - │ null ┆ null ┆ 43 │ - └────────┴───────────────┴──────────────────────────┘ - - ''' - def cum_prod(self) -> Self: - ''' - Get an array with the cumulative product computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_prod().alias("cum_prod"), - ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), - ... ) - shape: (4, 3) - ┌─────┬──────────┬──────────────────┐ - │ a ┆ cum_prod ┆ cum_prod_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪══════════╪══════════════════╡ - │ 1 ┆ 1 ┆ 24 │ - │ 2 ┆ 2 ┆ 24 │ - │ 3 ┆ 6 ┆ 12 │ - │ 4 ┆ 24 ┆ 4 │ - └─────┴──────────┴──────────────────┘ - - ''' - def cum_min(self) -> Self: - ''' - Get an array with the cumulative min computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_min().alias("cum_min"), - ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_min ┆ cum_min_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 1 ┆ 2 │ - │ 3 ┆ 1 ┆ 3 │ - │ 4 ┆ 1 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - ''' - def cum_max(self) -> Self: - ''' - Get an array with the cumulative max computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_max().alias("cum_max"), - ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_max ┆ cum_max_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 4 │ - │ 2 ┆ 2 ┆ 4 │ - │ 3 ┆ 3 ┆ 4 │ - │ 4 ┆ 4 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - Null values are excluded, but can also be filled by calling `forward_fill`. - - >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) - >>> df.with_columns( - ... pl.col("values").cum_max().alias("cum_max"), - ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), - ... ) - shape: (8, 3) - ┌────────┬─────────┬────────────────────┐ - │ values ┆ cum_max ┆ cum_max_all_filled │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞════════╪═════════╪════════════════════╡ - │ null ┆ null ┆ null │ - │ 10 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 8 ┆ 10 ┆ 10 │ - │ 9 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 16 ┆ 16 ┆ 16 │ - │ null ┆ null ┆ 16 │ - └────────┴─────────┴────────────────────┘ - - ''' - def cum_count(self) -> Self: - ''' - Get an array with the cumulative count computed at every element. - - Counting from 0 to len - - Parameters - ---------- - reverse - Reverse the operation. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_count().alias("cum_count"), - ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), - ... ) - shape: (4, 3) - ┌─────┬───────────┬───────────────────┐ - │ a ┆ cum_count ┆ cum_count_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ u32 ┆ u32 │ - ╞═════╪═══════════╪═══════════════════╡ - │ 1 ┆ 0 ┆ 3 │ - │ 2 ┆ 1 ┆ 2 │ - │ 3 ┆ 2 ┆ 1 │ - │ 4 ┆ 3 ┆ 0 │ - └─────┴───────────┴───────────────────┘ - - ''' - def floor(self) -> Self: - ''' - Rounds down to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) - >>> df.select(pl.col("a").floor()) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - │ 0.0 │ - │ 1.0 │ - │ 1.0 │ - └─────┘ - - ''' - def ceil(self) -> Self: - ''' - Rounds up to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) - >>> df.select(pl.col("a").ceil()) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 1.0 │ - │ 1.0 │ - │ 2.0 │ - └─────┘ - - ''' - def round(self, decimals: int = ...) -> Self: - ''' - Round underlying floating point data by `decimals` digits. - - Parameters - ---------- - decimals - Number of decimals to round by. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) - >>> df.select(pl.col("a").round(1)) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.3 │ - │ 0.5 │ - │ 1.0 │ - │ 1.2 │ - └─────┘ - - ''' - def round_sig_figs(self, digits: int) -> Self: - ''' - Round to a number of significant figures. - - Parameters - ---------- - digits - Number of significant figures to round to. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) - >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) - shape: (3, 2) - ┌─────────┬────────────────┐ - │ a ┆ round_sig_figs │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════════╪════════════════╡ - │ 0.01234 ┆ 0.012 │ - │ 3.333 ┆ 3.3 │ - │ 1234.0 ┆ 1200.0 │ - └─────────┴────────────────┘ - - ''' - def dot(self, other: Expr | str) -> Self: - ''' - Compute the dot/inner product between two Expressions. - - Parameters - ---------- - other - Expression to compute dot product with. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> df.select(pl.col("a").dot(pl.col("b"))) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 44 │ - └─────┘ - - ''' - def mode(self) -> Self: - ''' - Compute the most occurring value(s). - - Can return multiple Values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 1, 2, 3], - ... "b": [1, 1, 2, 2], - ... } - ... ) - >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 1 │ - │ 1 ┆ 2 │ - └─────┴─────┘ - - ''' - def cast(self, dtype: PolarsDataType | type[Any]) -> Self: - ''' - Cast between data types. - - Parameters - ---------- - dtype - DataType to cast to. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["4", "5", "6"], - ... } - ... ) - >>> df.with_columns( - ... [ - ... pl.col("a").cast(pl.Float64), - ... 
pl.col("b").cast(pl.Int32), - ... ] - ... ) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ i32 │ - ╞═════╪═════╡ - │ 1.0 ┆ 4 │ - │ 2.0 ┆ 5 │ - │ 3.0 ┆ 6 │ - └─────┴─────┘ - - ''' - def sort(self) -> Self: - ''' - Sort this column. - - When used in a projection/selection context, the whole column is sorted. - When used in a group by context, the groups are sorted. - - Parameters - ---------- - descending - Sort in descending order. - nulls_last - Place null values last. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, None, 3, 2], - ... } - ... ) - >>> df.select(pl.col("a").sort()) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ null │ - │ 1 │ - │ 2 │ - │ 3 │ - └──────┘ - >>> df.select(pl.col("a").sort(descending=True)) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ null │ - │ 3 │ - │ 2 │ - │ 1 │ - └──────┘ - >>> df.select(pl.col("a").sort(nulls_last=True)) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ 1 │ - │ 2 │ - │ 3 │ - │ null │ - └──────┘ - - When sorting in a group by context, the groups are sorted. - - >>> df = pl.DataFrame( - ... { - ... "group": ["one", "one", "one", "two", "two", "two"], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌───────┬────────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪════════════╡ - │ two ┆ [3, 4, 99] │ - │ one ┆ [1, 2, 98] │ - └───────┴────────────┘ - - ''' - def top_k(self, k: int | IntoExprColumn = ...) -> Self: - ''' - Return the `k` largest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - bottom_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("value").top_k().alias("top_k"), - ... pl.col("value").bottom_k().alias("bottom_k"), - ... ] - ... ) - shape: (5, 2) - ┌───────┬──────────┐ - │ top_k ┆ bottom_k │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═══════╪══════════╡ - │ 99 ┆ 1 │ - │ 98 ┆ 2 │ - │ 4 ┆ 3 │ - │ 3 ┆ 4 │ - │ 2 ┆ 98 │ - └───────┴──────────┘ - - ''' - def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: - ''' - Return the `k` smallest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - top_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("value").top_k().alias("top_k"), - ... pl.col("value").bottom_k().alias("bottom_k"), - ... ] - ... ) - shape: (5, 2) - ┌───────┬──────────┐ - │ top_k ┆ bottom_k │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═══════╪══════════╡ - │ 99 ┆ 1 │ - │ 98 ┆ 2 │ - │ 4 ┆ 3 │ - │ 3 ┆ 4 │ - │ 2 ┆ 98 │ - └───────┴──────────┘ - - ''' - def arg_sort(self) -> Self: - ''' - Get the index values that would sort this column. - - Parameters - ---------- - descending - Sort in descending (descending) order. - nulls_last - Place null values last instead of first. - - Returns - ------- - Expr - Expression of data type :class:`UInt32`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... 
) - >>> df.select(pl.col("a").arg_sort()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - │ 0 │ - │ 2 │ - └─────┘ - - ''' - def arg_max(self) -> Self: - ''' - Get the index of the maximal value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... ) - >>> df.select(pl.col("a").arg_max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def arg_min(self) -> Self: - ''' - Get the index of the minimal value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... ) - >>> df.select(pl.col("a").arg_min()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - └─────┘ - - ''' - def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: - ''' - Find indices where elements should be inserted to maintain order. - - .. math:: a[i-1] < v <= a[i] - - Parameters - ---------- - element - Expression or scalar value. - side : {\'any\', \'left\', \'right\'} - If \'any\', the index of the first suitable location found is given. - If \'left\', the index of the leftmost suitable location found is given. - If \'right\', return the rightmost suitable location found is given. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "values": [1, 2, 3, 5], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("values").search_sorted(0).alias("zero"), - ... pl.col("values").search_sorted(3).alias("three"), - ... pl.col("values").search_sorted(6).alias("six"), - ... ] - ... ) - shape: (1, 3) - ┌──────┬───────┬─────┐ - │ zero ┆ three ┆ six │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞══════╪═══════╪═════╡ - │ 0 ┆ 2 ┆ 4 │ - └──────┴───────┴─────┘ - - ''' - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: - ''' - Sort this column by the ordering of other columns. - - When used in a projection/selection context, the whole column is sorted. - When used in a group by context, the groups are sorted. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. - *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "a", "b", "b"], - ... "value1": [1, 3, 4, 2], - ... "value2": [8, 7, 6, 5], - ... } - ... ) - >>> df.select(pl.col("group").sort_by("value1")) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ a │ - │ b │ - │ a │ - │ b │ - └───────┘ - - Sorting by expressions is also supported. - - >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ b │ - │ a │ - │ a │ - │ b │ - └───────┘ - - Sort by multiple columns by passing a list of columns. - - >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ b │ - │ a │ - │ b │ - │ a │ - └───────┘ - - Or use positional arguments to sort by multiple columns in the same way. 
- - >>> df.select(pl.col("group").sort_by("value1", "value2")) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ a │ - │ b │ - │ a │ - │ b │ - └───────┘ - - When sorting in a group by context, the groups are sorted. - - >>> df.group_by("group").agg( - ... pl.col("value1").sort_by("value2") - ... ) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value1 │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ a ┆ [3, 1] │ - │ b ┆ [2, 4] │ - └───────┴───────────┘ - - Take a single row from each group where a column attains its minimal value - within that group. - - >>> df.group_by("group").agg( - ... pl.all().sort_by("value2").first() - ... ) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌───────┬────────┬────────┐ - │ group ┆ value1 ┆ value2 | - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 | - ╞═══════╪════════╪════════╡ - │ a ┆ 3 ┆ 7 | - │ b ┆ 2 ┆ 5 | - └───────┴────────┴────────┘ - - ''' - def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: - ''' - Take values by index. - - Parameters - ---------- - indices - An expression that leads to a UInt32 dtyped Series. - - Returns - ------- - Expr - Expression of the same data type. - - See Also - -------- - Expr.get : Take a single value - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... "one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group", maintain_order=True).agg( - ... pl.col("value").gather([2, 1]) - ... ) - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ one ┆ [2, 98] │ - │ two ┆ [4, 99] │ - └───────┴───────────┘ - ''' - def get(self, index: int | Expr) -> Self: - ''' - Return a single value by index. - - Parameters - ---------- - index - An expression that leads to a UInt32 index. - - Returns - ------- - Expr - Expression of the same data type. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... "one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) - shape: (2, 2) - ┌───────┬───────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═══════╡ - │ one ┆ 98 │ - │ two ┆ 99 │ - └───────┴───────┘ - - ''' - def shift(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns(shift=pl.col("a").shift()) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ null │ - │ 2 ┆ 1 │ - │ 3 ┆ 2 │ - │ 4 ┆ 3 │ - └─────┴───────┘ - - Pass a negative value to shift in the opposite direction instead. 
- - >>> df.with_columns(shift=pl.col("a").shift(-2)) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - │ 3 ┆ null │ - │ 4 ┆ null │ - └─────┴───────┘ - - Specify `fill_value` to fill the resulting null values. - - >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - │ 3 ┆ 100 │ - │ 4 ┆ 100 │ - └─────┴───────┘ - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: - ''' - Fill null values using the specified value or strategy. - - To interpolate over null values see interpolate. - See the examples below to fill nulls with an expression. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... } - ... ) - >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 0 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(99)) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 99 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 4 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞══════╪═════╡ - │ 1 ┆ 4.0 │ - │ 2 ┆ 5.0 │ - │ null ┆ 6.0 │ - └──────┴─────┘ - >>> df.with_columns(pl.all().fill_null(pl.all().median())) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 1.0 ┆ 4.0 │ - │ 2.0 ┆ 5.0 │ - │ 1.5 ┆ 6.0 │ - └─────┴─────┘ - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Self: - ''' - Fill floating point NaN value with a fill value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, None, float("nan")], - ... "b": [4.0, float("nan"), 6], - ... } - ... ) - >>> df.with_columns(pl.col("b").fill_nan(0)) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪═════╡ - │ 1.0 ┆ 4.0 │ - │ null ┆ 0.0 │ - │ NaN ┆ 6.0 │ - └──────┴─────┘ - - ''' - def forward_fill(self, limit: int | None = ...) -> Self: - ''' - Fill missing values with the latest seen values. - - Parameters - ---------- - limit - The number of consecutive null values to forward fill. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... } - ... ) - >>> df.select(pl.all().forward_fill()) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 4 │ - │ 2 ┆ 6 │ - └─────┴─────┘ - - ''' - def backward_fill(self, limit: int | None = ...) -> Self: - ''' - Fill missing values with the next to be seen values. 
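As a supplementary sketch (not part of the generated stub; the column name "a" is made up and a recent polars release is assumed), the fill strategies documented here can be chained: fill_nan(None) turns NaN into null so that a single null-filling strategy then handles both kinds of missing values.

    import polars as pl

    df = pl.DataFrame({"a": [1.0, None, float("nan"), 4.0]})

    # Convert NaN to null first, then forward-fill all nulls in one pass.
    cleaned = df.with_columns(
        pl.col("a").fill_nan(None).fill_null(strategy="forward").alias("a_filled")
    )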
- - Parameters - ---------- - limit - The number of consecutive null values to backward fill. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... "c": [None, None, 2], - ... } - ... ) - >>> df.select(pl.all().backward_fill()) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 4 ┆ 2 │ - │ 2 ┆ 6 ┆ 2 │ - │ null ┆ 6 ┆ 2 │ - └──────┴─────┴─────┘ - >>> df.select(pl.all().backward_fill(limit=1)) - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ 1 ┆ 4 ┆ null │ - │ 2 ┆ 6 ┆ 2 │ - │ null ┆ 6 ┆ 2 │ - └──────┴─────┴──────┘ - - ''' - def reverse(self) -> Self: - ''' - Reverse the selection. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4, 5], - ... "fruits": ["banana", "banana", "apple", "apple", "banana"], - ... "B": [5, 4, 3, 2, 1], - ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], - ... } - ... ) - >>> df.select( - ... [ - ... pl.all(), - ... pl.all().reverse().name.suffix("_reverse"), - ... ] - ... ) - shape: (5, 8) - ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ - │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ - │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ - │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ - │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ - │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ - │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ - └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ - - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Get standard deviation. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").std()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Get variance. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").var()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def max(self) -> Self: - ''' - Get maximum value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) - >>> df.select(pl.col("a").max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def min(self) -> Self: - ''' - Get minimum value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) - >>> df.select(pl.col("a").min()) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ -1.0 │ - └──────┘ - - ''' - def nan_max(self) -> Self: - ''' - Get maximum value, but propagate/poison encountered NaN values. 
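To make the min/max versus nan_min/nan_max contrast documented above concrete, a minimal illustrative sketch (not stub content): max ignores NaN, while nan_max propagates it.

    import polars as pl

    df = pl.DataFrame({"a": [1.0, float("nan"), 3.0]})
    df.select(
        pl.col("a").max().alias("max_ignores_nan"),      # 3.0
        pl.col("a").nan_max().alias("nan_max_poisons"),   # NaN
    )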
- - This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0, float("nan")]}) - >>> df.select(pl.col("a").nan_max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ NaN │ - └─────┘ - - ''' - def nan_min(self) -> Self: - ''' - Get minimum value, but propagate/poison encountered NaN values. - - This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0, float("nan")]}) - >>> df.select(pl.col("a").nan_min()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ NaN │ - └─────┘ - - ''' - def sum(self) -> Self: - ''' - Get sum value. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").sum()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 0 │ - └─────┘ - - ''' - def mean(self) -> Self: - ''' - Get mean value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").mean()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def median(self) -> Self: - ''' - Get median value using linear interpolation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").median()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def product(self) -> Self: - ''' - Compute the product of an expression. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").product()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def n_unique(self) -> Self: - ''' - Count unique values. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").n_unique()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def approx_n_unique(self) -> Self: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").approx_n_unique()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def null_count(self) -> Self: - ''' - Count null values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 1, None], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.select(pl.all().null_count()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 2 ┆ 0 │ - └─────┴─────┘ - - ''' - def arg_unique(self) -> Self: - ''' - Get index of first unique value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10], - ... "b": [None, 4, 4], - ... } - ... ) - >>> df.select(pl.col("a").arg_unique()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - │ 2 │ - └─────┘ - >>> df.select(pl.col("b").arg_unique()) - shape: (2, 1) - ┌─────┐ - │ b │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - └─────┘ - - ''' - def unique(self) -> Self: - ''' - Get unique values of this expression. - - Parameters - ---------- - maintain_order - Maintain order of data. This requires more work. 
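The n_unique and approx_n_unique docstrings above describe exact versus HyperLogLog++-based counting; a small hedged sketch (the "user_id" column is hypothetical) shows both side by side. On a column this small the results are identical; the approximate variant is aimed at very large columns where an exact count is comparatively expensive.

    import polars as pl

    df = pl.DataFrame({"user_id": [1, 2, 2, 3, 3, 3]})
    df.select(
        pl.col("user_id").n_unique().alias("exact"),
        pl.col("user_id").approx_n_unique().alias("approx"),
    )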
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - │ 1 │ - └─────┘ - >>> df.select(pl.col("a").unique(maintain_order=True)) - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - └─────┘ - - ''' - def first(self) -> Self: - ''' - Get the first value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").first()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - └─────┘ - - ''' - def last(self) -> Self: - ''' - Get the last value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").last()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: - ''' - Compute expressions over the given groups. - - This expression is similar to performing a group by aggregation and joining the - result back into the original DataFrame. - - The outcome is similar to how `window functions - `_ - work in PostgreSQL. - - Parameters - ---------- - expr - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_exprs - Additional columns to group by, specified as positional arguments. - mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} - - group_to_rows - If the aggregation results in multiple values, assign them back to their - position in the DataFrame. This can only be done if the group yields - the same elements before aggregation as after. - - join - Join the groups as \'List\' to the row positions. - warning: this can be memory intensive. - - explode - Don\'t do any mapping, but simply flatten the group. - This only makes sense if the input data is sorted. - - Examples - -------- - Pass the name of a column to compute the expression over that column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "a", "b", "b", "b"], - ... "b": [1, 2, 3, 5, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... ) - >>> df.with_columns( - ... pl.col("c").max().over("a").name.suffix("_max"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_max │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 5 │ - │ b ┆ 3 ┆ 3 ┆ 3 │ - │ b ┆ 5 ┆ 2 ┆ 3 │ - │ b ┆ 3 ┆ 1 ┆ 3 │ - └─────┴─────┴─────┴───────┘ - - Expression input is supported. - - >>> df.with_columns( - ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_max │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 4 │ - │ b ┆ 5 ┆ 2 ┆ 2 │ - │ b ┆ 3 ┆ 1 ┆ 4 │ - └─────┴─────┴─────┴───────┘ - - Group by multiple columns by passing a list of column names or expressions. - - >>> df.with_columns( - ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_min │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 1 │ - │ b ┆ 5 ┆ 2 ┆ 2 │ - │ b ┆ 3 ┆ 1 ┆ 1 │ - └─────┴─────┴─────┴───────┘ - - Or use positional arguments to group by multiple columns in the same way. - - >>> df.with_columns( - ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_min │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 1 │ - │ b ┆ 5 ┆ 2 ┆ 1 │ - │ b ┆ 3 ┆ 1 ┆ 1 │ - └─────┴─────┴─────┴───────┘ - - ''' - def rolling(self, index_column: str) -> Self: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - If you have a time series ``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... - * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... - * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order. - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> df.with_columns( - ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), - ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), - ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), - ... 
) - shape: (6, 5) - ┌─────────────────────┬─────┬───────┬───────┬───────┐ - │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴─────┴───────┴───────┴───────┘ - - ''' - def is_unique(self) -> Self: - ''' - Get mask of unique values. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").is_unique()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ false │ - │ true │ - └───────┘ - - ''' - def is_first_distinct(self) -> Self: - ''' - Return a boolean mask indicating the first occurrence of each distinct value. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) - >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) - shape: (5, 2) - ┌─────┬───────┐ - │ a ┆ first │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪═══════╡ - │ 1 ┆ true │ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 2 ┆ false │ - └─────┴───────┘ - - ''' - def is_last_distinct(self) -> Self: - ''' - Return a boolean mask indicating the last occurrence of each distinct value. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) - >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) - shape: (5, 2) - ┌─────┬───────┐ - │ a ┆ last │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪═══════╡ - │ 1 ┆ false │ - │ 1 ┆ true │ - │ 2 ┆ false │ - │ 3 ┆ true │ - │ 2 ┆ true │ - └─────┴───────┘ - - ''' - def is_duplicated(self) -> Self: - ''' - Return a boolean mask indicating duplicated values. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").is_duplicated()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ true │ - │ true │ - │ false │ - └───────┘ - - ''' - def peak_max(self) -> Self: - ''' - Get a boolean mask of the local maximum peaks. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) - >>> df.select(pl.col("a").peak_max()) - shape: (5, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ false │ - │ false │ - │ false │ - │ true │ - └───────┘ - - ''' - def peak_min(self) -> Self: - ''' - Get a boolean mask of the local minimum peaks. - - Examples - -------- - >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) - >>> df.select(pl.col("a").peak_min()) - shape: (5, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ false │ - │ true │ - │ false │ - └───────┘ - - ''' - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Get quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. 
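As an illustrative aside (a sketch, assuming the five interpolation methods listed above are all available in the targeted polars version), the methods can be compared in a single select by building one expression per method:

    import polars as pl

    df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]})
    df.select(
        [
            pl.col("a").quantile(0.3, interpolation=method).alias(method)
            for method in ("nearest", "lower", "higher", "midpoint", "linear")
        ]
    )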
- - Examples - -------- - >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) - >>> df.select(pl.col("a").quantile(0.3)) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 2.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.5 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.5 │ - └─────┘ - - ''' - def cut(self, breaks: Sequence[float]) -> Self: - ''' - Bin continuous values into discrete categories. - - Parameters - ---------- - breaks - List of unique cut points. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - left_closed - Set the intervals to be left-closed instead of right-closed. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - - Returns - ------- - Expr - Expression of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise an expression of data type :class:`Struct`. - - See Also - -------- - qcut - - Examples - -------- - Divide a column into three categories. - - >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) - >>> df.with_columns( - ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") - ... ) - shape: (5, 2) - ┌─────┬─────┐ - │ foo ┆ cut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪═════╡ - │ -2 ┆ a │ - │ -1 ┆ a │ - │ 0 ┆ b │ - │ 1 ┆ b │ - │ 2 ┆ c │ - └─────┴─────┘ - - Add both the category and the breakpoint. - - >>> df.with_columns( - ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") - ... ).unnest("cut") - shape: (5, 3) - ┌─────┬──────┬────────────┐ - │ foo ┆ brk ┆ foo_bin │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪══════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴──────┴────────────┘ - - ''' - def qcut(self, quantiles: Sequence[float] | int) -> Self: - ''' - Bin continuous values into discrete categories based on their quantiles. - - Parameters - ---------- - quantiles - Either a list of quantile probabilities between 0 and 1 or a positive - integer determining the number of bins with uniform probability. - labels - Names of the categories. The number of labels must be equal to the number - of categories. - left_closed - Set the intervals to be left-closed instead of right-closed. - allow_duplicates - If set to `True`, duplicates in the resulting quantiles are dropped, - rather than raising a `DuplicateError`. This can happen even with unique - probabilities, depending on the data. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - - Returns - ------- - Expr - Expression of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise an expression of data type :class:`Struct`. 
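The cut/qcut docstrings show how the bins are produced; as a hedged follow-up sketch (column and label names are invented), the resulting Categorical column can be used directly as a group key:

    import polars as pl

    df = pl.DataFrame({"score": [12, 35, 55, 61, 88, 94]})
    binned = df.with_columns(
        band=pl.col("score").cut([40, 70], labels=["low", "mid", "high"])
    )
    # Count rows per bin (pl.len() replaces pl.count() on newer polars releases).
    binned.group_by("band").agg(pl.count().alias("n"))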
- - See Also - -------- - cut - - Examples - -------- - Divide a column into three categories according to pre-defined quantile - probabilities. - - >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) - >>> df.with_columns( - ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ foo ┆ qcut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪══════╡ - │ -2 ┆ a │ - │ -1 ┆ a │ - │ 0 ┆ b │ - │ 1 ┆ b │ - │ 2 ┆ c │ - └─────┴──────┘ - - Divide a column into two categories using uniform quantile probabilities. - - >>> df.with_columns( - ... pl.col("foo") - ... .qcut(2, labels=["low", "high"], left_closed=True) - ... .alias("qcut") - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ foo ┆ qcut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪══════╡ - │ -2 ┆ low │ - │ -1 ┆ low │ - │ 0 ┆ high │ - │ 1 ┆ high │ - │ 2 ┆ high │ - └─────┴──────┘ - - Add both the category and the breakpoint. - - >>> df.with_columns( - ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") - ... ).unnest("qcut") - shape: (5, 3) - ┌─────┬──────┬────────────┐ - │ foo ┆ brk ┆ foo_bin │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪══════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴──────┴────────────┘ - - ''' - def rle(self) -> Self: - ''' - Get the lengths of runs of identical values. - - Returns - ------- - Expr - Expression of data type :class:`Struct` with Fields "lengths" and "values". - - Examples - -------- - >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) - >>> df.select(pl.col("s").rle()).unnest("s") - shape: (6, 2) - ┌─────────┬────────┐ - │ lengths ┆ values │ - │ --- ┆ --- │ - │ i32 ┆ i64 │ - ╞═════════╪════════╡ - │ 2 ┆ 1 │ - │ 1 ┆ 2 │ - │ 1 ┆ 1 │ - │ 1 ┆ null │ - │ 1 ┆ 1 │ - │ 2 ┆ 3 │ - └─────────┴────────┘ - ''' - def rle_id(self) -> Self: - ''' - Map values to run IDs. - - Similar to RLE, but it maps each value to an ID corresponding to the run into - which it falls. This is especially useful when you want to define groups by - runs of identical values rather than the values themselves. - - - Examples - -------- - >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) - >>> # It works on structs of multiple values too! - >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) - shape: (5, 4) - ┌─────┬──────┬─────┬──────┐ - │ a ┆ b ┆ a_r ┆ ab_r │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ u32 ┆ u32 │ - ╞═════╪══════╪═════╪══════╡ - │ 1 ┆ x ┆ 0 ┆ 0 │ - │ 2 ┆ x ┆ 1 ┆ 1 │ - │ 1 ┆ null ┆ 2 ┆ 2 │ - │ 1 ┆ y ┆ 2 ┆ 3 │ - │ 1 ┆ y ┆ 2 ┆ 3 │ - └─────┴──────┴─────┴──────┘ - ''' - def filter(self, predicate: Expr) -> Self: - ''' - Filter a single column. - - The original order of the remaining elements is preserved. - - Mostly useful in an aggregation context. If you want to filter on a DataFrame - level, use `LazyFrame.filter`. - - Parameters - ---------- - predicate - Boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group_col": ["g1", "g1", "g2"], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.group_by("group_col").agg( - ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), - ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), - ... 
).sort("group_col") - shape: (2, 3) - ┌───────────┬─────┬─────┐ - │ group_col ┆ lt ┆ gte │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═══════════╪═════╪═════╡ - │ g1 ┆ 1 ┆ 2 │ - │ g2 ┆ 0 ┆ 3 │ - └───────────┴─────┴─────┘ - - ''' - def where(self, predicate: Expr) -> Self: - ''' - Filter a single column. - - Alias for :func:`filter`. - - Parameters - ---------- - predicate - Boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group_col": ["g1", "g1", "g2"], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.group_by("group_col").agg( - ... [ - ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), - ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), - ... ] - ... ).sort("group_col") - shape: (2, 3) - ┌───────────┬─────┬─────┐ - │ group_col ┆ lt ┆ gte │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═══════════╪═════╪═════╡ - │ g1 ┆ 1 ┆ 2 │ - │ g2 ┆ 0 ┆ 3 │ - └───────────┴─────┴─────┘ - - ''' - def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Apply a custom python function to a whole Series or sequence of Series. - - The output of this custom function must be a Series. If you want to apply a - custom function elementwise over single values, see :func:`map_elements`. - A reasonable use case for `map` functions is transforming the values - represented by an expression using a third-party library. - - Read more in `the book - `_. - - Parameters - ---------- - function - Lambda/function to apply. - return_dtype - Dtype of the output Series. - agg_list - Aggregate list. - - Notes - ----- - If you are looking to map a function over a window function or group_by context, - refer to func:`map_elements` instead. - - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - See Also - -------- - map_elements - replace - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "sine": [0.0, 1.0, 0.0, -1.0], - ... "cosine": [1.0, 0.0, -1.0, 0.0], - ... } - ... ) - >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) - shape: (1, 2) - ┌──────┬────────┐ - │ sine ┆ cosine │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪════════╡ - │ 1 ┆ 0 │ - └──────┴────────┘ - - ''' - def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Map a custom/user-defined function (UDF) to each element of a column. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - The UDF is applied to each element of a column. Note that, in a GroupBy - context, the column will have been pre-aggregated and so each element - will itself be a Series. Therefore, depending on the context, - requirements for `function` differ: - - * Selection - Expects `function` to be of type `Callable[[Any], Any]`. - Applies a Python function to each individual value in the column. - * GroupBy - Expects `function` to be of type `Callable[[Series], Any]`. - For each group, applies a Python function to the slice of the column - corresponding to that group. - - Parameters - ---------- - function - Lambda/function to map. - return_dtype - Dtype of the output Series. - If not set, the dtype will be `pl.Unknown`. - skip_nulls - Don\'t map the function over values that contain nulls (this is faster). 
- pass_name - Pass the Series name to the custom function (this is more expensive). - strategy : {\'thread_local\', \'threading\'} - This functionality is considered experimental and may be removed/changed. - - - \'thread_local\': run the python function on a single thread. - - \'threading\': run the python function on separate threads. Use with - care as this can slow performance. This might only speed up - your code if the amount of work per element is significant - and the python function releases the GIL (e.g. via calling - a c function) - - Notes - ----- - * Using `map_elements` is strongly discouraged as you will be effectively - running python "for" loops, which will be very slow. Wherever possible you - should prefer the native expression API to achieve the best performance. - - * If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. - - * Window function application using `over` is considered a GroupBy context - here, so `map_elements` can be used to map functions over window groups. - - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["a", "b", "c", "c"], - ... } - ... ) - - The function is applied to each element of column `\'a\'`: - - >>> df.with_columns( # doctest: +SKIP - ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────────┐ - │ a ┆ b ┆ a_times_2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 │ - ╞═════╪═════╪═══════════╡ - │ 1 ┆ a ┆ 2 │ - │ 2 ┆ b ┆ 4 │ - │ 3 ┆ c ┆ 6 │ - │ 1 ┆ c ┆ 2 │ - └─────┴─────┴───────────┘ - - Tip: it is better to implement this with an expression: - - >>> df.with_columns( - ... (pl.col("a") * 2).alias("a_times_2"), - ... ) # doctest: +IGNORE_RESULT - - In a GroupBy context, each element of the column is itself a Series: - - >>> ( - ... df.lazy().group_by("b").agg(pl.col("a")).collect() - ... ) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬───────────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [1] │ - │ b ┆ [2] │ - │ c ┆ [3, 1] │ - └─────┴───────────┘ - - Therefore, from the user\'s point-of-view, the function is applied per-group: - - >>> ( - ... df.lazy() - ... .group_by("b") - ... .agg(pl.col("a").map_elements(lambda x: x.sum())) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ b ┆ 2 │ - │ c ┆ 4 │ - └─────┴─────┘ - - Tip: again, it is better to implement this with an expression: - - >>> ( - ... df.lazy() - ... .group_by("b", maintain_order=True) - ... .agg(pl.col("a").sum()) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - - Window function application using `over` will behave as a GroupBy - context, with your function receiving individual window groups: - - >>> df = pl.DataFrame( - ... { - ... "key": ["x", "x", "y", "x", "y", "z"], - ... "val": [1, 1, 1, 1, 1, 1], - ... } - ... ) - >>> df.with_columns( - ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), - ... 
).sort("key") - shape: (6, 3) - ┌─────┬─────┬────────┐ - │ key ┆ val ┆ scaled │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪════════╡ - │ x ┆ 1 ┆ 3 │ - │ x ┆ 1 ┆ 3 │ - │ x ┆ 1 ┆ 3 │ - │ y ┆ 1 ┆ 2 │ - │ y ┆ 1 ┆ 2 │ - │ z ┆ 1 ┆ 1 │ - └─────┴─────┴────────┘ - - Note that this function would *also* be better-implemented natively: - - >>> df.with_columns( - ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), - ... ).sort( - ... "key" - ... ) # doctest: +IGNORE_RESULT - - ''' - def flatten(self) -> Self: - ''' - Flatten a list or string column. - - Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "b", "b"], - ... "values": [[1, 2], [2, 3], [4]], - ... } - ... ) - >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ values │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ a ┆ [1, 2] │ - │ b ┆ [2, 3, 4] │ - └───────┴───────────┘ - - ''' - def explode(self) -> Self: - ''' - Explode a list expression. - - This means that every item is expanded to a new row. - - Returns - ------- - Expr - Expression with the data type of the list elements. - - See Also - -------- - Expr.list.explode : Explode a list column. - Expr.str.explode : Explode a string column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "b"], - ... "values": [ - ... [1, 2], - ... [3, 4], - ... ], - ... } - ... ) - >>> df.select(pl.col("values").explode()) - shape: (4, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 1 │ - │ 2 │ - │ 3 │ - │ 4 │ - └────────┘ - - ''' - def implode(self) -> Self: - ''' - Aggregate values into a list. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [4, 5, 6], - ... } - ... ) - >>> df.select(pl.all().implode()) - shape: (1, 2) - ┌───────────┬───────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ list[i64] ┆ list[i64] │ - ╞═══════════╪═══════════╡ - │ [1, 2, 3] ┆ [4, 5, 6] │ - └───────────┴───────────┘ - - ''' - def gather_every(self, n: int) -> Self: - ''' - Take every nth value in the Series and return as a new Series. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) - >>> df.select(pl.col("foo").gather_every(3)) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 4 │ - │ 7 │ - └─────┘ - - ''' - def head(self, n: int | Expr = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.head(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def tail(self, n: int | Expr = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.tail(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 5 │ - │ 6 │ - │ 7 │ - └─────┘ - - ''' - def limit(self, n: int | Expr = ...) -> Self: - ''' - Get the first `n` rows (alias for :func:`Expr.head`). - - Parameters - ---------- - n - Number of rows to return. 
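head, tail, limit and gather_every are ordinary expressions, so they also apply per group inside an aggregation; a minimal sketch with hypothetical column names, assuming a recent polars release:

    import polars as pl

    df = pl.DataFrame({"grp": ["a", "a", "a", "b", "b"], "x": [1, 2, 3, 4, 5]})
    df.group_by("grp", maintain_order=True).agg(
        pl.col("x").head(2).alias("first_two"),          # first two values per group
        pl.col("x").gather_every(2).alias("every_other"),
        pl.col("x").tail(1).alias("last_one"),
    )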
- - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.limit(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def and_(self, *others: Any) -> Self: - ''' - Method equivalent of bitwise "and" operator `expr & other & ...`. - - Parameters - ---------- - *others - One or more integer or boolean expressions to evaluate/combine. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5, 6, 7, 4, 8], - ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], - ... "z": [-9, 2, -1, 4, 8], - ... } - ... ) - >>> df.select( - ... (pl.col("x") >= pl.col("z")) - ... .and_( - ... pl.col("y") >= pl.col("z"), - ... pl.col("y") == pl.col("y"), - ... pl.col("z") <= pl.col("x"), - ... pl.col("y") != pl.col("x"), - ... ) - ... .alias("all") - ... ) - shape: (5, 1) - ┌───────┐ - │ all │ - │ --- │ - │ bool │ - ╞═══════╡ - │ true │ - │ true │ - │ true │ - │ false │ - │ false │ - └───────┘ - - ''' - def or_(self, *others: Any) -> Self: - ''' - Method equivalent of bitwise "or" operator `expr | other | ...`. - - Parameters - ---------- - *others - One or more integer or boolean expressions to evaluate/combine. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5, 6, 7, 4, 8], - ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], - ... "z": [-9, 2, -1, 4, 8], - ... } - ... ) - >>> df.select( - ... (pl.col("x") == pl.col("y")) - ... .or_( - ... pl.col("x") == pl.col("y"), - ... pl.col("y") == pl.col("z"), - ... pl.col("y").cast(int) == pl.col("z"), - ... ) - ... .alias("any") - ... ) - shape: (5, 1) - ┌───────┐ - │ any │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ false │ - │ true │ - │ false │ - └───────┘ - - ''' - def eq(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr == other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0], - ... "y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").eq(pl.col("y")).alias("x == y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x == y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 1.0 ┆ 2.0 ┆ false │ - │ 2.0 ┆ 2.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 4.0 ┆ 4.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def eq_missing(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr == other` where `None == None`. - - This differs from default `eq` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], - ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").eq(pl.col("y")).alias("x eq y"), - ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), - ... 
) - shape: (6, 4) - ┌──────┬──────┬────────┬────────────────┐ - │ x ┆ y ┆ x eq y ┆ x eq_missing y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪══════╪════════╪════════════════╡ - │ 1.0 ┆ 2.0 ┆ false ┆ false │ - │ 2.0 ┆ 2.0 ┆ true ┆ true │ - │ NaN ┆ NaN ┆ false ┆ false │ - │ 4.0 ┆ 4.0 ┆ true ┆ true │ - │ null ┆ 5.0 ┆ null ┆ false │ - │ null ┆ null ┆ null ┆ true │ - └──────┴──────┴────────┴────────────────┘ - - ''' - def ge(self, other: Any) -> Self: - ''' - Method equivalent of "greater than or equal" operator `expr >= other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 2.0], - ... "y": [5.0, 3.0, float("nan"), 1.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ge(pl.col("y")).alias("x >= y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x >= y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 5.0 ┆ 5.0 ┆ true │ - │ 4.0 ┆ 3.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 2.0 ┆ 1.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def gt(self, other: Any) -> Self: - ''' - Method equivalent of "greater than" operator `expr > other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 2.0], - ... "y": [5.0, 3.0, float("nan"), 1.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").gt(pl.col("y")).alias("x > y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────┐ - │ x ┆ y ┆ x > y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪═══════╡ - │ 5.0 ┆ 5.0 ┆ false │ - │ 4.0 ┆ 3.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 2.0 ┆ 1.0 ┆ true │ - └─────┴─────┴───────┘ - - ''' - def le(self, other: Any) -> Self: - ''' - Method equivalent of "less than or equal" operator `expr <= other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 0.5], - ... "y": [5.0, 3.5, float("nan"), 2.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").le(pl.col("y")).alias("x <= y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x <= y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 5.0 ┆ 5.0 ┆ true │ - │ 4.0 ┆ 3.5 ┆ false │ - │ NaN ┆ NaN ┆ false │ - │ 0.5 ┆ 2.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def lt(self, other: Any) -> Self: - ''' - Method equivalent of "less than" operator `expr < other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 3.0], - ... "y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").lt(pl.col("y")).alias("x < y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────┐ - │ x ┆ y ┆ x < y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪═══════╡ - │ 1.0 ┆ 2.0 ┆ true │ - │ 2.0 ┆ 2.0 ┆ false │ - │ NaN ┆ NaN ┆ false │ - │ 3.0 ┆ 4.0 ┆ true │ - └─────┴─────┴───────┘ - - ''' - def ne(self, other: Any) -> Self: - ''' - Method equivalent of inequality operator `expr != other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0], - ... 
"y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ne(pl.col("y")).alias("x != y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x != y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 1.0 ┆ 2.0 ┆ true │ - │ 2.0 ┆ 2.0 ┆ false │ - │ NaN ┆ NaN ┆ true │ - │ 4.0 ┆ 4.0 ┆ false │ - └─────┴─────┴────────┘ - - ''' - def ne_missing(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr != other` where `None == None`. - - This differs from default `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], - ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ne(pl.col("y")).alias("x ne y"), - ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), - ... ) - shape: (6, 4) - ┌──────┬──────┬────────┬────────────────┐ - │ x ┆ y ┆ x ne y ┆ x ne_missing y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪══════╪════════╪════════════════╡ - │ 1.0 ┆ 2.0 ┆ true ┆ true │ - │ 2.0 ┆ 2.0 ┆ false ┆ false │ - │ NaN ┆ NaN ┆ true ┆ true │ - │ 4.0 ┆ 4.0 ┆ false ┆ false │ - │ null ┆ 5.0 ┆ null ┆ true │ - │ null ┆ null ┆ null ┆ false │ - └──────┴──────┴────────┴────────────────┘ - - ''' - def add(self, other: Any) -> Self: - ''' - Method equivalent of addition operator `expr + other`. - - Parameters - ---------- - other - numeric or string value; accepts expression input. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) - >>> df.with_columns( - ... pl.col("x").add(2).alias("x+int"), - ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), - ... ) - shape: (5, 3) - ┌─────┬───────┬────────┐ - │ x ┆ x+int ┆ x+expr │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═══════╪════════╡ - │ 1 ┆ 3 ┆ 2 │ - │ 2 ┆ 4 ┆ 4 │ - │ 3 ┆ 5 ┆ 9 │ - │ 4 ┆ 6 ┆ 28 │ - │ 5 ┆ 7 ┆ 125 │ - └─────┴───────┴────────┘ - - >>> df = pl.DataFrame( - ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} - ... ) - >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) - shape: (3, 4) - ┌─────┬─────┬─────┬─────┐ - │ x ┆ y ┆ z ┆ xyz │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ str ┆ str │ - ╞═════╪═════╪═════╪═════╡ - │ a ┆ b ┆ c ┆ abc │ - │ d ┆ e ┆ f ┆ def │ - │ g ┆ h ┆ i ┆ ghi │ - └─────┴─────┴─────┴─────┘ - - ''' - def floordiv(self, other: Any) -> Self: - ''' - Method equivalent of integer division operator `expr // other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - See Also - -------- - truediv - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) - >>> df.with_columns( - ... pl.col("x").truediv(2).alias("x/2"), - ... pl.col("x").floordiv(2).alias("x//2"), - ... ) - shape: (5, 3) - ┌─────┬─────┬──────┐ - │ x ┆ x/2 ┆ x//2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ i64 │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 0.5 ┆ 0 │ - │ 2 ┆ 1.0 ┆ 1 │ - │ 3 ┆ 1.5 ┆ 1 │ - │ 4 ┆ 2.0 ┆ 2 │ - │ 5 ┆ 2.5 ┆ 2 │ - └─────┴─────┴──────┘ - - ''' - def mod(self, other: Any) -> Self: - ''' - Method equivalent of modulus operator `expr % other`. - - Parameters - ---------- - other - Numeric literal or expression value. 
- - Examples - -------- - >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) - >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) - shape: (5, 2) - ┌─────┬─────┐ - │ x ┆ x%2 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 0 ┆ 0 │ - │ 1 ┆ 1 │ - │ 2 ┆ 0 │ - │ 3 ┆ 1 │ - │ 4 ┆ 0 │ - └─────┴─────┘ - - ''' - def mul(self, other: Any) -> Self: - ''' - Method equivalent of multiplication operator `expr * other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) - >>> df.with_columns( - ... pl.col("x").mul(2).alias("x*2"), - ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), - ... ) - shape: (5, 3) - ┌─────┬─────┬───────────┐ - │ x ┆ x*2 ┆ x * xlog2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ f64 │ - ╞═════╪═════╪═══════════╡ - │ 1 ┆ 2 ┆ 0.0 │ - │ 2 ┆ 4 ┆ 2.0 │ - │ 4 ┆ 8 ┆ 8.0 │ - │ 8 ┆ 16 ┆ 24.0 │ - │ 16 ┆ 32 ┆ 64.0 │ - └─────┴─────┴───────────┘ - - ''' - def sub(self, other: Any) -> Self: - ''' - Method equivalent of subtraction operator `expr - other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("x").sub(2).alias("x-2"), - ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), - ... ) - shape: (5, 3) - ┌─────┬─────┬────────┐ - │ x ┆ x-2 ┆ x-expr │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪════════╡ - │ 0 ┆ -2 ┆ 0 │ - │ 1 ┆ -1 ┆ 0 │ - │ 2 ┆ 0 ┆ -1 │ - │ 3 ┆ 1 ┆ -3 │ - │ 4 ┆ 2 ┆ -6 │ - └─────┴─────┴────────┘ - - ''' - def truediv(self, other: Any) -> Self: - ''' - Method equivalent of float division operator `expr / other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Notes - ----- - Zero-division behaviour follows IEEE-754: - - 0/0: Invalid operation - mathematically undefined, returns NaN. - n/0: On finite operands gives an exact infinite result, eg: ±infinity. - - See Also - -------- - floordiv - - Examples - -------- - >>> df = pl.DataFrame( - ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} - ... ) - >>> df.with_columns( - ... pl.col("x").truediv(2).alias("x/2"), - ... pl.col("x").truediv(pl.col("y")).alias("x/y"), - ... ) - shape: (5, 4) - ┌─────┬──────┬──────┬───────┐ - │ x ┆ y ┆ x/2 ┆ x/y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ f64 ┆ f64 │ - ╞═════╪══════╪══════╪═══════╡ - │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ - │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ - │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ - │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ - │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ - └─────┴──────┴──────┴───────┘ - - ''' - def pow(self, exponent: int | float | None | Series | Expr) -> Self: - ''' - Method equivalent of exponentiation operator `expr ** exponent`. - - Parameters - ---------- - exponent - Numeric literal or expression exponent value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) - >>> df.with_columns( - ... pl.col("x").pow(3).alias("cube"), - ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), - ... ) - shape: (4, 3) - ┌─────┬───────┬────────────┐ - │ x ┆ cube ┆ x ** xlog2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ f64 │ - ╞═════╪═══════╪════════════╡ - │ 1 ┆ 1.0 ┆ 1.0 │ - │ 2 ┆ 8.0 ┆ 2.0 │ - │ 4 ┆ 64.0 ┆ 16.0 │ - │ 8 ┆ 512.0 ┆ 512.0 │ - └─────┴───────┴────────────┘ - - ''' - def xor(self, other: Any) -> Self: - ''' - Method equivalent of bitwise exclusive-or operator `expr ^ other`. - - Parameters - ---------- - other - Integer or boolean value; accepts expression input. 
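Since and_, or_ and xor each accept expression input, comparison results can be combined into boolean masks without reaching for the &, | and ^ operators; a hedged sketch (column names are made up):

    import polars as pl

    df = pl.DataFrame({"x": [1, 4, 6, 9], "y": [2, 4, 5, 10]})
    df.with_columns(
        both_large=(pl.col("x") > 3).and_(pl.col("y") > 3),
        either_large=(pl.col("x") > 8).or_(pl.col("y") > 8),
        exactly_one=(pl.col("x") > 3).xor(pl.col("y") > 8),
    )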
- - Examples - -------- - >>> df = pl.DataFrame( - ... {"x": [True, False, True, False], "y": [True, True, False, False]} - ... ) - >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) - shape: (4, 3) - ┌───────┬───────┬───────┐ - │ x ┆ y ┆ x ^ y │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞═══════╪═══════╪═══════╡ - │ true ┆ true ┆ false │ - │ false ┆ true ┆ true │ - │ true ┆ false ┆ true │ - │ false ┆ false ┆ false │ - └───────┴───────┴───────┘ - - >>> def binary_string(n: int) -> str: - ... return bin(n)[2:].zfill(8) - >>> - >>> df = pl.DataFrame( - ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, - ... schema={"x": pl.UInt8, "y": pl.UInt8}, - ... ) - >>> df.with_columns( - ... pl.col("x").map_elements(binary_string).alias("bin_x"), - ... pl.col("y").map_elements(binary_string).alias("bin_y"), - ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), - ... pl.col("x") - ... .xor(pl.col("y")) - ... .map_elements(binary_string) - ... .alias("bin_xor_xy"), - ... ) - shape: (4, 6) - ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ - │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ - ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ - │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ - │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ - │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ - │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ - └─────┴─────┴──────────┴──────────┴────────┴────────────┘ - - ''' - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: - ''' - Check if elements of this expression are present in the other Series. - - Parameters - ---------- - other - Series or sequence of primitive type. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} - ... ) - >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) - shape: (3, 3) - ┌───────────┬──────────────────┬──────────┐ - │ sets ┆ optional_members ┆ contains │ - │ --- ┆ --- ┆ --- │ - │ list[i64] ┆ i64 ┆ bool │ - ╞═══════════╪══════════════════╪══════════╡ - │ [1, 2, 3] ┆ 1 ┆ true │ - │ [1, 2] ┆ 2 ┆ true │ - │ [9, 10] ┆ 3 ┆ false │ - └───────────┴──────────────────┴──────────┘ - - ''' - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: - ''' - Repeat the elements in this Series as specified in the given expression. - - The repeated elements are expanded into a `List`. - - Parameters - ---------- - by - Numeric column that determines how often the values will be repeated. - The column will be coerced to UInt32. Give this dtype to make the coercion a - no-op. - - Returns - ------- - Expr - Expression of data type :class:`List`, where the inner data type is equal - to the original data type. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["x", "y", "z"], - ... "n": [1, 2, 3], - ... } - ... ) - >>> df.select(pl.col("a").repeat_by("n")) - shape: (3, 1) - ┌─────────────────┐ - │ a │ - │ --- │ - │ list[str] │ - ╞═════════════════╡ - │ ["x"] │ - │ ["y", "y"] │ - │ ["z", "z", "z"] │ - └─────────────────┘ - - ''' - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: - ''' - Check if this expression is between the given start and end values. - - Parameters - ---------- - lower_bound - Lower bound value. Accepts expression input. 
Strings are parsed as column - names, other non-expression inputs are parsed as literals. - upper_bound - Upper bound value. Accepts expression input. Strings are parsed as column - names, other non-expression inputs are parsed as literals. - closed : {\'both\', \'left\', \'right\', \'none\'} - Define which sides of the interval are closed (inclusive). - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) - >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) - shape: (5, 2) - ┌─────┬────────────┐ - │ num ┆ is_between │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪════════════╡ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 4 ┆ true │ - │ 5 ┆ false │ - └─────┴────────────┘ - - Use the `closed` argument to include or exclude the values at the bounds: - - >>> df.with_columns( - ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") - ... ) - shape: (5, 2) - ┌─────┬────────────┐ - │ num ┆ is_between │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪════════════╡ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 4 ┆ false │ - │ 5 ┆ false │ - └─────┴────────────┘ - - You can also use strings as well as numeric/temporal values (note: ensure that - string literals are wrapped with `lit` so as not to conflate them with - column names): - - >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) - >>> df.with_columns( - ... pl.col("a") - ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") - ... .alias("is_between") - ... ) - shape: (5, 2) - ┌─────┬────────────┐ - │ a ┆ is_between │ - │ --- ┆ --- │ - │ str ┆ bool │ - ╞═════╪════════════╡ - │ a ┆ true │ - │ b ┆ true │ - │ c ┆ true │ - │ d ┆ false │ - │ e ┆ false │ - └─────┴────────────┘ - - ''' - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: - ''' - Hash the elements in the selection. - - The hash value is of type `UInt64`. - - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. - - Notes - ----- - This implementation of :func:`rows` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": ["x", None, "z"], - ... } - ... ) - >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌──────────────────────┬──────────────────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u64 ┆ u64 │ - ╞══════════════════════╪══════════════════════╡ - │ 9774092659964970114 ┆ 13614470193936745724 │ - │ 1101441246220388612 ┆ 11638928888656214026 │ - │ 11638928888656214026 ┆ 13382926553367784577 │ - └──────────────────────┴──────────────────────┘ - - ''' - def reinterpret(self) -> Self: - ''' - Reinterpret the underlying bits as a signed/unsigned integer. - - This operation is only allowed for 64bit integers. For lower bits integers, - you can safely use that cast operation. - - Parameters - ---------- - signed - If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. - - Examples - -------- - >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) - >>> df = pl.DataFrame([s]) - >>> df.select( - ... [ - ... 
pl.col("a").reinterpret(signed=True).alias("reinterpreted"), - ... pl.col("a").alias("original"), - ... ] - ... ) - shape: (3, 2) - ┌───────────────┬──────────┐ - │ reinterpreted ┆ original │ - │ --- ┆ --- │ - │ i64 ┆ u64 │ - ╞═══════════════╪══════════╡ - │ 1 ┆ 1 │ - │ 1 ┆ 1 │ - │ 2 ┆ 2 │ - └───────────────┴──────────┘ - - ''' - def inspect(self, fmt: str = ...) -> Self: - ''' - Print the value that this expression evaluates to and pass on the value. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 1, 2]}) - >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) - value is: shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 4 - ] - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 4 │ - └─────┘ - - ''' - def interpolate(self, method: InterpolationMethod = ...) -> Self: - ''' - Fill null values using interpolation. - - Parameters - ---------- - method : {\'linear\', \'nearest\'} - Interpolation method. - - Examples - -------- - Fill null values using linear interpolation. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, None, 3], - ... "b": [1.0, float("nan"), 3.0], - ... } - ... ) - >>> df.select(pl.all().interpolate()) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 1.0 ┆ 1.0 │ - │ 2.0 ┆ NaN │ - │ 3.0 ┆ 3.0 │ - └─────┴─────┘ - - Fill null values using nearest interpolation. - - >>> df.select(pl.all().interpolate("nearest")) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 1.0 │ - │ 3 ┆ NaN │ - │ 3 ┆ 3.0 │ - └─────┴─────┘ - - Regrid data to a new grid. - - >>> df_original_grid = pl.DataFrame( - ... { - ... "grid_points": [1, 3, 10], - ... "values": [2.0, 6.0, 20.0], - ... } - ... ) # Interpolate from this to the new grid - >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) - >>> df_new_grid.join( - ... df_original_grid, on="grid_points", how="left" - ... ).with_columns(pl.col("values").interpolate()) - shape: (10, 2) - ┌─────────────┬────────┐ - │ grid_points ┆ values │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════════════╪════════╡ - │ 1 ┆ 2.0 │ - │ 2 ┆ 4.0 │ - │ 3 ┆ 6.0 │ - │ 4 ┆ 8.0 │ - │ … ┆ … │ - │ 7 ┆ 14.0 │ - │ 8 ┆ 16.0 │ - │ 9 ┆ 18.0 │ - │ 10 ┆ 20.0 │ - └─────────────┴────────┘ - - ''' - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling min (moving min) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their min. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. 
Can be a fixed integer size, or a dynamic - temporal size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 4.0 │ - │ 6.0 ┆ 5.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.25 │ - │ 3.0 ┆ 0.5 │ - │ 4.0 ┆ 0.75 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ 1.25 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 4.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... 
).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - >>> df_temporal.with_columns( - ... rolling_row_min=pl.col("row_nr").rolling_min( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_min │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling max (moving max) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their max. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. 
- closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ 6.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.25 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 3.75 │ - │ 6.0 ┆ 4.5 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 3.0 │ - │ 3.0 ┆ 4.0 │ - │ 4.0 ┆ 5.0 │ - │ 5.0 ┆ 6.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling max with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_max=pl.col("row_nr").rolling_max( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_max │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling max with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_max=pl.col("row_nr").rolling_max( - ... 
window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_max │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling mean (moving mean) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their mean. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. 
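The Notes above recommend computing several statistics over the same dynamic window inside a single `rolling` context, since that context can cache the window computation. A minimal sketch of that idea, assuming a Polars version where `DataFrame.rolling` (formerly `group_by_rolling`) is available; the frame and column names mirror the temporal examples used in these docstrings:

    import polars as pl
    from datetime import datetime

    df_temporal = pl.DataFrame(
        {"date": pl.datetime_range(datetime(2001, 1, 1), datetime(2001, 1, 2), "1h", eager=True)}
    ).with_row_count()

    # One rolling context, several aggregations over the same "2h" window,
    # instead of separate rolling_min / rolling_max / rolling_mean calls.
    df_temporal.rolling(index_column="date", period="2h", closed="left").agg(
        pl.col("row_nr").min().alias("rolling_row_min"),
        pl.col("row_nr").max().alias("rolling_row_max"),
        pl.col("row_nr").mean().alias("rolling_row_mean"),
    )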
- - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴──────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.75 │ - │ 3.0 ┆ 2.75 │ - │ 4.0 ┆ 3.75 │ - │ 5.0 ┆ 4.75 │ - │ 6.0 ┆ 5.75 │ - └─────┴──────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ null │ - └─────┴──────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling mean with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_mean=pl.col("row_nr").rolling_mean( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬──────────────────┐ - │ row_nr ┆ date ┆ rolling_row_mean │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪══════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ - └────────┴─────────────────────┴──────────────────┘ - - Compute the rolling mean with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_mean=pl.col("row_nr").rolling_mean( - ... window_size="2h", by="date", closed="both" - ... ) - ... 
) - shape: (25, 3) - ┌────────┬─────────────────────┬──────────────────┐ - │ row_nr ┆ date ┆ rolling_row_mean │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪══════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ - └────────┴─────────────────────┴──────────────────┘ - - ''' - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling sum (moving sum) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their sum. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - of dtype `{Date, Datetime}` - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum(window_size=2), - ... 
) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 3.0 │ - │ 3.0 ┆ 5.0 │ - │ 4.0 ┆ 7.0 │ - │ 5.0 ┆ 9.0 │ - │ 6.0 ┆ 11.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.75 │ - │ 3.0 ┆ 2.75 │ - │ 4.0 ┆ 3.75 │ - │ 5.0 ┆ 4.75 │ - │ 6.0 ┆ 5.75 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 6.0 │ - │ 3.0 ┆ 9.0 │ - │ 4.0 ┆ 12.0 │ - │ 5.0 ┆ 15.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling sum with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_sum=pl.col("row_nr").rolling_sum( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_sum │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling sum with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_sum=pl.col("row_nr").rolling_sum( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_sum │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Self: - ''' - Compute a rolling standard deviation. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` means - the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.707107 │ - │ 3.0 ┆ 0.707107 │ - │ 4.0 ┆ 0.707107 │ - │ 5.0 ┆ 0.707107 │ - │ 6.0 ┆ 0.707107 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... 
) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.433013 │ - │ 3.0 ┆ 0.433013 │ - │ 4.0 ┆ 0.433013 │ - │ 5.0 ┆ 0.433013 │ - │ 6.0 ┆ 0.433013 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 1.0 │ - │ 4.0 ┆ 1.0 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling std with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_std=pl.col("row_nr").rolling_std( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_std │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling std with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_std=pl.col("row_nr").rolling_std( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_std │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling variance. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... 
- - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.5 │ - │ 3.0 ┆ 0.5 │ - │ 4.0 ┆ 0.5 │ - │ 5.0 ┆ 0.5 │ - │ 6.0 ┆ 0.5 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.1875 │ - │ 3.0 ┆ 0.1875 │ - │ 4.0 ┆ 0.1875 │ - │ 5.0 ┆ 0.1875 │ - │ 6.0 ┆ 0.1875 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), - ... 
) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 1.0 │ - │ 4.0 ┆ 1.0 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling var with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_var=pl.col("row_nr").rolling_var( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_var │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling var with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_var=pl.col("row_nr").rolling_var( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_var │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling median. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` means - the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. 
Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴────────────────┘ - - Specify weights for the values in each window: - - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴────────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ null │ - └─────┴────────────────┘ - - ''' - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling quantile. 
- - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - window_size - The length of the window. Can be a fixed integer size, or a dynamic - temporal size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, window_size=4 - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 2.0 │ - │ 5.0 ┆ 3.0 │ - │ 6.0 ┆ 4.0 │ - └─────┴──────────────────┘ - - Specify weights for the values in each window: - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] - ... ), - ... 
) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 2.0 │ - │ 5.0 ┆ 3.0 │ - │ 6.0 ┆ 4.0 │ - └─────┴──────────────────┘ - - Specify weights and interpolation method - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, - ... window_size=4, - ... weights=[0.2, 0.4, 0.4, 0.2], - ... interpolation="linear", - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 1.625 │ - │ 5.0 ┆ 2.625 │ - │ 6.0 ┆ 3.625 │ - └─────┴──────────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.2, window_size=5, center=True - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ null │ - │ 6.0 ┆ null │ - └─────┴──────────────────┘ - - ''' - def rolling_skew(self, window_size: int) -> Self: - ''' - Compute a rolling skew. - - The window at a given row includes the row itself and the - `window_size - 1` elements before it. - - Parameters - ---------- - window_size - Integer size of the rolling window. - bias - If False, the calculations are corrected for statistical bias. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) - >>> df.select(pl.col("a").rolling_skew(3)) - shape: (4, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ null │ - │ null │ - │ 0.381802 │ - │ 0.47033 │ - └──────────┘ - - Note how the values match the following: - - >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() - (0.38180177416060584, 0.47033046033698594) - - ''' - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a custom rolling window function. - - .. warning:: - Computing custom functions is extremely slow. Use specialized rolling - functions such as :func:`Expr.rolling_sum` if at all possible. - - Parameters - ---------- - function - Custom aggregation function. - window_size - Size of the window. The window at a given row will include the row - itself and the `window_size - 1` elements before it. - weights - A list of weights with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window. - - Examples - -------- - >>> from numpy import nansum - >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) - >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) - shape: (5, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ null │ - │ null │ - │ 22.0 │ - │ 11.0 │ - │ 17.0 │ - └──────┘ - - ''' - def abs(self) -> Self: - ''' - Compute absolute values. - - Same as `abs(expr)`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [-1.0, 0.0, 1.0, 2.0], - ... } - ... 
) - >>> df.select(pl.col("A").abs()) - shape: (4, 1) - ┌─────┐ - │ A │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 0.0 │ - │ 1.0 │ - │ 2.0 │ - └─────┘ - - ''' - def rank(self, method: RankMethod = ...) -> Self: - ''' - Assign ranks to data, dealing with ties appropriately. - - Parameters - ---------- - method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} - The method used to assign ranks to tied elements. - The following methods are available (default is \'average\'): - - - \'average\' : The average of the ranks that would have been assigned to - all the tied values is assigned to each value. - - \'min\' : The minimum of the ranks that would have been assigned to all - the tied values is assigned to each value. (This is also referred to - as "competition" ranking.) - - \'max\' : The maximum of the ranks that would have been assigned to all - the tied values is assigned to each value. - - \'dense\' : Like \'min\', but the rank of the next highest element is - assigned the rank immediately after those assigned to the tied - elements. - - \'ordinal\' : All values are given a distinct rank, corresponding to - the order that the values occur in the Series. - - \'random\' : Like \'ordinal\', but the rank for ties is not dependent - on the order that the values occur in the Series. - descending - Rank in descending order. - seed - If `method="random"`, use this as seed. - - Examples - -------- - The \'average\' method: - - >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) - >>> df.select(pl.col("a").rank()) - shape: (5, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 3.0 │ - │ 4.5 │ - │ 1.5 │ - │ 1.5 │ - │ 4.5 │ - └─────┘ - - The \'ordinal\' method: - - >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) - >>> df.select(pl.col("a").rank("ordinal")) - shape: (5, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 3 │ - │ 4 │ - │ 1 │ - │ 2 │ - │ 5 │ - └─────┘ - - Use \'rank\' with \'over\' to rank within groups: - - >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) - >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) - shape: (5, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ rank │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ f64 │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ 1.0 │ - │ 1 ┆ 7 ┆ 2.0 │ - │ 2 ┆ 5 ┆ 1.0 │ - │ 2 ┆ 14 ┆ 3.0 │ - │ 2 ┆ 11 ┆ 2.0 │ - └─────┴─────┴──────┘ - - ''' - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: - ''' - Calculate the first discrete difference between shifted items. - - Parameters - ---------- - n - Number of slots to shift. - null_behavior : {\'ignore\', \'drop\'} - How to handle null values. - - Examples - -------- - >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) - >>> df.with_columns(change=pl.col("int").diff()) - shape: (5, 2) - ┌─────┬────────┐ - │ int ┆ change │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪════════╡ - │ 20 ┆ null │ - │ 10 ┆ -10 │ - │ 30 ┆ 20 │ - │ 25 ┆ -5 │ - │ 35 ┆ 10 │ - └─────┴────────┘ - - >>> df.with_columns(change=pl.col("int").diff(n=2)) - shape: (5, 2) - ┌─────┬────────┐ - │ int ┆ change │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪════════╡ - │ 20 ┆ null │ - │ 10 ┆ null │ - │ 30 ┆ 10 │ - │ 25 ┆ 15 │ - │ 35 ┆ 5 │ - └─────┴────────┘ - - >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) - shape: (3, 1) - ┌──────┐ - │ diff │ - │ --- │ - │ i64 │ - ╞══════╡ - │ 10 │ - │ 15 │ - │ 5 │ - └──────┘ - - ''' - def pct_change(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Computes percentage change between values. 
- - Percentage change (as fraction) between current element and most-recent - non-null element at least `n` period(s) before the current element. - - Computes the change from the previous row by default. - - Parameters - ---------- - n - periods to shift for forming percent change. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [10, 11, 12, None, 12], - ... } - ... ) - >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) - shape: (5, 2) - ┌──────┬────────────┐ - │ a ┆ pct_change │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞══════╪════════════╡ - │ 10 ┆ null │ - │ 11 ┆ 0.1 │ - │ 12 ┆ 0.090909 │ - │ null ┆ 0.0 │ - │ 12 ┆ 0.0 │ - └──────┴────────────┘ - - ''' - def skew(self) -> Self: - ''' - Compute the sample skewness of a data set. - - For normally distributed data, the skewness should be about zero. For - unimodal continuous distributions, a skewness value greater than zero means - that there is more weight in the right tail of the distribution. The - function `skewtest` can be used to determine if the skewness value - is close enough to zero, statistically speaking. - - - See scipy.stats for more information. - - Parameters - ---------- - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Notes - ----- - The sample skewness is computed as the Fisher-Pearson coefficient - of skewness, i.e. - - .. math:: g_1=\\frac{m_3}{m_2^{3/2}} - - where - - .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i - - is the biased sample :math:`i\\texttt{th}` central moment, and - :math:`\\bar{x}` is - the sample mean. If `bias` is False, the calculations are - corrected for bias and the value computed is the adjusted - Fisher-Pearson standardized moment coefficient, i.e. - - .. math:: - G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").skew()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.343622 │ - └──────────┘ - - ''' - def kurtosis(self) -> Self: - ''' - Compute the kurtosis (Fisher or Pearson) of a dataset. - - Kurtosis is the fourth central moment divided by the square of the - variance. If Fisher\'s definition is used, then 3.0 is subtracted from - the result to give 0.0 for a normal distribution. - If bias is False then the kurtosis is calculated using k statistics to - eliminate bias coming from biased moment estimators. - - See scipy.stats for more information - - Parameters - ---------- - fisher : bool, optional - If True, Fisher\'s definition is used (normal ==> 0.0). If False, - Pearson\'s definition is used (normal ==> 3.0). - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").kurtosis()) - shape: (1, 1) - ┌───────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -1.153061 │ - └───────────┘ - - ''' - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: - ''' - Set values outside the given boundaries to the boundary value. - - Parameters - ---------- - lower_bound - Lower bound. Accepts expression input. - Non-expression inputs are parsed as literals. - upper_bound - Upper bound. Accepts expression input. - Non-expression inputs are parsed as literals. 
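`clip` itself only applies to numeric and temporal columns; as the Notes below point out, other dtypes can be handled with a `when`/`then`/`otherwise` expression instead. A rough sketch of that approach, with illustrative column names and bound values (not taken from the stub):

    import polars as pl

    df = pl.DataFrame({"s": ["a", "b", "x", "z"]})

    # Emulate clipping a string column to the range "b".."y":
    # values below "b" become "b", values above "y" become "y", the rest pass through.
    df.with_columns(
        clipped=pl.when(pl.col("s") < pl.lit("b"))
        .then(pl.lit("b"))
        .when(pl.col("s") > pl.lit("y"))
        .then(pl.lit("y"))
        .otherwise(pl.col("s"))
    )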
- - See Also - -------- - when - - Notes - ----- - This method only works for numeric and temporal columns. To clip other data - types, consider writing a `when-then-otherwise` expression. See :func:`when`. - - Examples - -------- - Specifying both a lower and upper bound: - - >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) - >>> df.with_columns(clip=pl.col("a").clip(1, 10)) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ clip │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ -50 ┆ 1 │ - │ 5 ┆ 5 │ - │ 50 ┆ 10 │ - │ null ┆ null │ - └──────┴──────┘ - - Specifying only a single bound: - - >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ clip │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ -50 ┆ -50 │ - │ 5 ┆ 5 │ - │ 50 ┆ 10 │ - │ null ┆ null │ - └──────┴──────┘ - - ''' - def lower_bound(self) -> Self: - ''' - Calculate the lower bound. - - Returns a unit Series with the lowest value possible for the dtype of this - expression. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").lower_bound()) - shape: (1, 1) - ┌──────────────────────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════════════════════╡ - │ -9223372036854775808 │ - └──────────────────────┘ - - ''' - def upper_bound(self) -> Self: - ''' - Calculate the upper bound. - - Returns a unit Series with the highest value possible for the dtype of this - expression. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").upper_bound()) - shape: (1, 1) - ┌─────────────────────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════════════════════╡ - │ 9223372036854775807 │ - └─────────────────────┘ - - ''' - def sign(self) -> Self: - ''' - Compute the element-wise indication of the sign. - - The returned values can be -1, 0, or 1: - - * -1 if x < 0. - * 0 if x == 0. - * 1 if x > 0. - - (null values are preserved as-is). - - Examples - -------- - >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) - >>> df.select(pl.col("a").sign()) - shape: (5, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ -1 │ - │ 0 │ - │ 0 │ - │ 1 │ - │ null │ - └──────┘ - - ''' - def sin(self) -> Self: - ''' - Compute the element-wise value for the sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").sin()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def cos(self) -> Self: - ''' - Compute the element-wise value for the cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").cos()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def tan(self) -> Self: - ''' - Compute the element-wise value for the tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").tan().round(2)) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 1.56 │ - └──────┘ - - ''' - def cot(self) -> Self: - ''' - Compute the element-wise value for the cotangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").cot().round(2)) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 0.64 │ - └──────┘ - - ''' - def arcsin(self) -> Self: - ''' - Compute the element-wise value for the inverse sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arcsin()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.570796 │ - └──────────┘ - - ''' - def arccos(self) -> Self: - ''' - Compute the element-wise value for the inverse cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").arccos()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.570796 │ - └──────────┘ - - ''' - def arctan(self) -> Self: - ''' - Compute the element-wise value for the inverse tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arctan()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.785398 │ - └──────────┘ - - ''' - def sinh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").sinh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.175201 │ - └──────────┘ - - ''' - def cosh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").cosh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.543081 │ - └──────────┘ - - ''' - def tanh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").tanh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.761594 │ - └──────────┘ - - ''' - def arcsinh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arcsinh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.881374 │ - └──────────┘ - - ''' - def arccosh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arccosh()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def arctanh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arctanh()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ inf │ - └─────┘ - - ''' - def degrees(self) -> Self: - ''' - Convert from radians to degrees. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> import math - >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) - >>> df.select(pl.col("a").degrees()) - shape: (9, 1) - ┌────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞════════╡ - │ -720.0 │ - │ -540.0 │ - │ -360.0 │ - │ -180.0 │ - │ 0.0 │ - │ 180.0 │ - │ 360.0 │ - │ 540.0 │ - │ 720.0 │ - └────────┘ - ''' - def radians(self) -> Self: - ''' - Convert from degrees to radians. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) - >>> df.select(pl.col("a").radians()) - shape: (9, 1) - ┌────────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞════════════╡ - │ -12.566371 │ - │ -9.424778 │ - │ -6.283185 │ - │ -3.141593 │ - │ 0.0 │ - │ 3.141593 │ - │ 6.283185 │ - │ 9.424778 │ - │ 12.566371 │ - └────────────┘ - ''' - def reshape(self, dimensions: tuple[int, ...]) -> Self: - ''' - Reshape this Expr to a flat Series or a Series of Lists. - - Parameters - ---------- - dimensions - Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that - dimension is inferred. - - Returns - ------- - Expr - If a single dimension is given, results in an expression of the original - data type. - If a multiple dimensions are given, results in an expression of data type - :class:`List` with shape (rows, cols). - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) - >>> df.select(pl.col("foo").reshape((3, 3))) - shape: (3, 1) - ┌───────────┐ - │ foo │ - │ --- │ - │ list[i64] │ - ╞═══════════╡ - │ [1, 2, 3] │ - │ [4, 5, 6] │ - │ [7, 8, 9] │ - └───────────┘ - - See Also - -------- - Expr.list.explode : Explode a list column. - - ''' - def shuffle(self, seed: int | None = ...) -> Self: - ''' - Shuffle the contents of this expression. - - Parameters - ---------- - seed - Seed for the random number generator. If set to None (default), a - random seed is generated each time the shuffle is called. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").shuffle(seed=1)) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - │ 1 │ - │ 3 │ - └─────┘ - - ''' - def sample(self, n: int | IntoExprColumn | None = ...) -> Self: - ''' - Sample from this expression. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - Shuffle the order of sampled data points. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 3 │ - │ 1 │ - │ 1 │ - └─────┘ - - ''' - def ewm_mean(self) -> Self: - ''' - Exponentially-weighted moving average. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. 
math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_mean(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.666667 │ - │ 2.428571 │ - └──────────┘ - - ''' - def ewm_std(self) -> Self: - ''' - Exponentially-weighted moving standard deviation. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. 
- For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_std(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 0.707107 │ - │ 0.963624 │ - └──────────┘ - - ''' - def ewm_var(self) -> Self: - ''' - Exponentially-weighted moving variance. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_var(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 0.5 │ - │ 0.928571 │ - └──────────┘ - - ''' - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: - ''' - Extremely fast method for extending the Series with \'n\' copies of a value. - - Parameters - ---------- - value - A constant literal value (not an expression) with which to extend the - expression result Series; can pass None to extend with nulls. - n - The number of additional values that will be added. 
- - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3]}) - >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) - shape: (5, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 0 │ - │ 1 │ - │ 2 │ - │ 99 │ - │ 99 │ - └────────┘ - - ''' - def value_counts(self) -> Self: - ''' - Count the occurrences of unique values. - - Parameters - ---------- - sort - Sort the output by count in descending order. - If set to `False` (default), the order of the output is random. - parallel - Execute the computation in parallel. - - .. note:: - This option should likely not be enabled in a group by context, - as the computation is already parallelized per group. - - Returns - ------- - Expr - Expression of data type :class:`Struct` with mapping of unique values to - their count. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} - ... ) - >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT - shape: (3, 1) - ┌─────────────┐ - │ color │ - │ --- │ - │ struct[2] │ - ╞═════════════╡ - │ {"red",2} │ - │ {"green",1} │ - │ {"blue",3} │ - └─────────────┘ - - Sort the output by count. - - >>> df.select(pl.col("color").value_counts(sort=True)) - shape: (3, 1) - ┌─────────────┐ - │ color │ - │ --- │ - │ struct[2] │ - ╞═════════════╡ - │ {"blue",3} │ - │ {"red",2} │ - │ {"green",1} │ - └─────────────┘ - - ''' - def unique_counts(self) -> Self: - ''' - Return a count of the unique values in the order of appearance. - - This method differs from `value_counts` in that it does not return the - values, only the counts and might be faster - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "id": ["a", "b", "b", "c", "c", "c"], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("id").unique_counts(), - ... ] - ... ) - shape: (3, 1) - ┌─────┐ - │ id │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def log(self, base: float = ...) -> Self: - ''' - Compute the logarithm to a given base. - - Parameters - ---------- - base - Given base, defaults to `e` - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").log(base=2)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 1.0 │ - │ 1.584963 │ - └──────────┘ - - ''' - def log1p(self) -> Self: - ''' - Compute the natural logarithm of each element plus one. - - This computes `log(1 + x)` but is more numerically stable for `x` close to zero. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").log1p()) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.693147 │ - │ 1.098612 │ - │ 1.386294 │ - └──────────┘ - - ''' - def entropy(self, base: float = ...) -> Self: - ''' - Computes the entropy. - - Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. - - Parameters - ---------- - base - Given base, defaults to `e` - normalize - Normalize pk if it doesn\'t sum to 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").entropy(base=2)) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.459148 │ - └──────────┘ - >>> df.select(pl.col("a").entropy(base=2, normalize=False)) - shape: (1, 1) - ┌───────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -6.754888 │ - └───────────┘ - - ''' - def cumulative_eval(self, expr: Expr, min_periods: int = ...) 
-> Self: - ''' - Run an expression over a sliding window that increases `1` slot every iteration. - - Parameters - ---------- - expr - Expression to evaluate - min_periods - Number of valid values there should be in the window before the expression - is evaluated. valid values = `length - null_count` - parallel - Run in parallel. Don\'t do this in a group by or another operation that - already has much parallelization. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - This can be really slow as it can have `O(n^2)` complexity. Don\'t use this - for operations that visit all elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) - >>> df.select( - ... [ - ... pl.col("values").cumulative_eval( - ... pl.element().first() - pl.element().last() ** 2 - ... ) - ... ] - ... ) - shape: (5, 1) - ┌────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞════════╡ - │ 0.0 │ - │ -3.0 │ - │ -8.0 │ - │ -15.0 │ - │ -24.0 │ - └────────┘ - - ''' - def set_sorted(self) -> Self: - ''' - Flags the expression as \'sorted\'. - - Enables downstream code to user fast paths for sorted arrays. - - Parameters - ---------- - descending - Whether the `Series` order is descending. - - Warnings - -------- - This can lead to incorrect results if this `Series` is not sorted!! - Use with care! - - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3]}) - >>> df.select(pl.col("values").set_sorted().max()) - shape: (1, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 3 │ - └────────┘ - - ''' - def shrink_dtype(self) -> Self: - ''' - Shrink numeric columns to the minimal required datatype. - - Shrink to the dtype needed to fit the extrema of this [`Series`]. - This can be used to reduce memory pressure. - - Examples - -------- - >>> pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [1, 2, 2 << 32], - ... "c": [-1, 2, 1 << 30], - ... "d": [-112, 2, 112], - ... "e": [-112, 2, 129], - ... "f": ["a", "b", "c"], - ... "g": [0.1, 1.32, 0.12], - ... "h": [True, None, False], - ... } - ... ).select(pl.all().shrink_dtype()) - shape: (3, 8) - ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ - ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ - │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ - │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ - │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ - └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ - - ''' - def cache(self) -> Self: - """ - Cache this expression so that it only is executed once per context. - - .. deprecated:: 0.18.9 - This method now does nothing. It has been superseded by the - `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically - caches expressions that are equal. - - """ - def replace(self, mapping: dict[Any, Any]) -> Self: - ''' - Replace values according to the given mapping. - - Needs a global string cache for lazily evaluated queries on columns of - type `Categorical`. - - Parameters - ---------- - mapping - Mapping of values to their replacement. - default - Value to use when the mapping does not contain the lookup value. - Defaults to keeping the original value. Accepts expression input. - Non-expression inputs are parsed as literals. - return_dtype - Set return dtype to override automatic return dtype determination. 
- - See Also - -------- - str.replace - - Examples - -------- - Replace a single value by another value. Values not in the mapping remain - unchanged. - - >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) - >>> df.with_columns(pl.col("a").replace({2: 100}).alias("replaced")) - shape: (4, 2) - ┌─────┬──────────┐ - │ a ┆ replaced │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════════╡ - │ 1 ┆ 1 │ - │ 2 ┆ 100 │ - │ 2 ┆ 100 │ - │ 3 ┆ 3 │ - └─────┴──────────┘ - - Replace multiple values. Specify a default to set values not in the given map - to the default value. - - >>> df = pl.DataFrame({"country_code": ["FR", "ES", "DE", None]}) - >>> country_code_map = { - ... "CA": "Canada", - ... "DE": "Germany", - ... "FR": "France", - ... None: "unspecified", - ... } - >>> df.with_columns( - ... pl.col("country_code") - ... .replace(country_code_map, default=None) - ... .alias("replaced") - ... ) - shape: (4, 2) - ┌──────────────┬─────────────┐ - │ country_code ┆ replaced │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞══════════════╪═════════════╡ - │ FR ┆ France │ - │ ES ┆ null │ - │ DE ┆ Germany │ - │ null ┆ unspecified │ - └──────────────┴─────────────┘ - - The return type can be overridden with the `return_dtype` argument. - - >>> df = df.with_row_count() - >>> df.select( - ... "row_nr", - ... pl.col("row_nr") - ... .replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) - ... .alias("replaced"), - ... ) - shape: (4, 2) - ┌────────┬──────────┐ - │ row_nr ┆ replaced │ - │ --- ┆ --- │ - │ u32 ┆ u8 │ - ╞════════╪══════════╡ - │ 0 ┆ 0 │ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 0 │ - └────────┴──────────┘ - - To reference other columns as a `default` value, a struct column must be - constructed first. The first field must be the column in which values are - replaced. The other columns can be used in the default expression. - - >>> df.with_columns( - ... pl.struct("country_code", "row_nr") - ... .replace( - ... mapping=country_code_map, - ... default=pl.col("row_nr").cast(pl.Utf8), - ... ) - ... .alias("replaced") - ... ) - shape: (4, 3) - ┌────────┬──────────────┬─────────────┐ - │ row_nr ┆ country_code ┆ replaced │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ str ┆ str │ - ╞════════╪══════════════╪═════════════╡ - │ 0 ┆ FR ┆ France │ - │ 1 ┆ ES ┆ 1 │ - │ 2 ┆ DE ┆ Germany │ - │ 3 ┆ null ┆ unspecified │ - └────────┴──────────────┴─────────────┘ - ''' - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom python function to a Series or sequence of Series. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.map_batches`. - - Parameters - ---------- - function - Lambda/ function to apply. - return_dtype - Dtype of the output Series. - agg_list - Aggregate list - - """ - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.map_elements`. - - Parameters - ---------- - function - Lambda/ function to apply. - return_dtype - Dtype of the output Series. - If not set, the dtype will be - `polars.Unknown`. - skip_nulls - Don't apply the function over values - that contain nulls. This is faster. - pass_name - Pass the Series name to the custom function - This is more expensive. - strategy : {'thread_local', 'threading'} - This functionality is in `alpha` stage. 
This may be removed - /changed without it being considered a breaking change. - - - 'thread_local': run the python function on a single thread. - - 'threading': run the python function on separate threads. Use with - care as this can slow performance. This might only speed up - your code if the amount of work per element is significant - and the python function releases the GIL (e.g. via calling - a c function) - - """ - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - """ - Apply a custom rolling window function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.rolling_map`. - - Parameters - ---------- - function - Aggregation function - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - """ - def is_first(self) -> Self: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Expr.is_first_distinct`. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - """ - def is_last(self) -> Self: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Expr.is_last_distinct`. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - """ - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: - """ - Clip (limit) the values in an array to a `min` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - lower_bound - Lower bound. - - """ - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: - """ - Clip (limit) the values in an array to a `max` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - upper_bound - Upper bound. - - """ - def shift_and_fill(self, fill_value: IntoExpr) -> Self: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - Fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def register_plugin(self) -> Self: - """ - Register a shared library as a plugin. - - .. warning:: - This is highly unsafe as this will call the C function - loaded by `lib::symbol`. - - The parameters you give dictate how polars will deal - with the function. Make sure they are correct! - - .. note:: - This functionality is unstable and may change without it - being considered breaking. - - Parameters - ---------- - lib - Library to load. - symbol - Function to load. - args - Arguments (other than self) passed to this function. - These arguments have to be of type Expression. - kwargs - Non-expression arguments. They must be JSON serializable. - is_elementwise - If the function only operates on scalars - this will trigger fast paths. - input_wildcard_expansion - Expand expressions as input of this function. 
- returns_scalar - Automatically explode on unit length if it ran as final aggregation. - this is the case for aggregations like `sum`, `min`, `covariance` etc. - cast_to_supertypes - Cast the input datatypes to their supertype. - pass_name_to_apply - if set, then the `Series` passed to the function in the group_by operation - will ensure the name is set. This is an extra heap allocation per group. - changes_length - For example a `unique` or a `slice` - - """ - def _register_plugin(self) -> Self: ... - def take_every(self, n: int) -> Self: - """ - Take every nth value in the Series and return as a new Series. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: - """ - Take values by index. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather`. - - Parameters - ---------- - indices - An expression that leads to a UInt32 dtyped Series. - """ - def cumsum(self) -> Self: - """ - Get an array with the cumulative sum computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_sum`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cumprod(self) -> Self: - """ - Get an array with the cumulative product computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_prod`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cummin(self) -> Self: - """ - Get an array with the cumulative min computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_min`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cummax(self) -> Self: - """ - Get an array with the cumulative max computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_max`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cumcount(self) -> Self: - """ - Get an array with the cumulative count computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_count`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def map_dict(self, mapping: dict[Any, Any]) -> Self: - """ - Replace values in column according to remapping dictionary. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`replace`. The default behavior - has changed to keep any values not present in the mapping unchanged. - Pass `default=None` to keep existing behavior. - - Parameters - ---------- - mapping - Dictionary containing the before/after values to map. - default - Value to use when the remapping dict does not contain the lookup value. - Accepts expression input. Non-expression inputs are parsed as literals. - Use `pl.first()`, to keep the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - - """ - @property - def bin(self): ... - @property - def cat(self): ... - @property - def dt(self): ... - @property - def list(self): ... - @property - def arr(self): ... - @property - def meta(self): ... - @property - def name(self): ... - @property - def str(self): ... - @property - def struct(self): ... -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float:
-    """Normalise EWM decay specification in terms of smoothing factor 'alpha'."""
-def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ...
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/expr/expr.pyi
new file mode 100644
index 0000000..f9827ee
--- /dev/null
+++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/expr/expr.pyi
@@ -0,0 +1,8386 @@
+#: version 0.20.1
+import P
+import np as np
+import pl
+from builtins import PyExpr
+from datetime import timedelta
+from polars.datatypes.classes import UInt32 as UInt32
+from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype
+from polars.dependencies import _check_for_numpy as _check_for_numpy
+from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning
+from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace
+from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace
+from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace
+from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace
+from polars.expr.list import ExprListNameSpace as ExprListNameSpace
+from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace
+from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace
+from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace
+from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace
+from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions
+from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration
+from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating
+from polars.utils.meta import threadpool_size as threadpool_size
+from polars.utils.various import _warn_null_comparison as _warn_null_comparison, no_default as no_default, sphinx_accessor as sphinx_accessor
+from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence
+
+TYPE_CHECKING: bool
+py_arg_where: builtin_function_or_method
+pyreduce: builtin_function_or_method
+
+class Expr:
+    _pyexpr: _ClassVar[None] = ...
+    _accessors: _ClassVar[set] = ...
+    @classmethod
+    def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ...
+    def _to_pyexpr(self, other: Any) -> PyExpr: ...
+    def _repr_html_(self) -> str: ...
+    def __bool__(self) -> NoReturn: ...
+    def __abs__(self) -> Self: ...
+    def __add__(self, other: Any) -> Self: ...
+    def __radd__(self, other: Any) -> Self: ...
+    def __and__(self, other: Expr | int | bool) -> Self: ...
+    def __rand__(self, other: Any) -> Self: ...
+    def __eq__(self, other: Any) -> Self: ...
+    def __floordiv__(self, other: Any) -> Self: ...
+    def __rfloordiv__(self, other: Any) -> Self: ...
+    def __ge__(self, other: Any) -> Self: ...
+    def __gt__(self, other: Any) -> Self: ...
+    def __invert__(self) -> Self: ...
+    def __le__(self, other: Any) -> Self: ...
+    def __lt__(self, other: Any) -> Self: ...
+    def __mod__(self, other: Any) -> Self: ...
+ def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + .. 
note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.map`. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + keep_name + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.prefix`. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.suffix`. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.keep`. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with `^` and end with `$`. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... 
"""Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Return the number of non-null elements in the column. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + len + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 2 │ + └─────┴─────┘ + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values count towards the total. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + count + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. 
Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cum_sum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_sum().alias("cum_sum"), + ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_sum ┆ cum_sum_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 10 │ + │ 2 ┆ 3 ┆ 9 │ + │ 3 ┆ 6 ┆ 7 │ + │ 4 ┆ 10 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_sum().alias("value_cum_sum"), + ... pl.col("values") + ... 
.cum_sum() + ... .forward_fill() + ... .alias("value_cum_sum_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬───────────────┬──────────────────────────┐ + │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═══════════════╪══════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴───────────────┴──────────────────────────┘ + + ''' + def cum_prod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_prod().alias("cum_prod"), + ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), + ... ) + shape: (4, 3) + ┌─────┬──────────┬──────────────────┐ + │ a ┆ cum_prod ┆ cum_prod_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════╪══════════════════╡ + │ 1 ┆ 1 ┆ 24 │ + │ 2 ┆ 2 ┆ 24 │ + │ 3 ┆ 6 ┆ 12 │ + │ 4 ┆ 24 ┆ 4 │ + └─────┴──────────┴──────────────────┘ + + ''' + def cum_min(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_min().alias("cum_min"), + ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_min ┆ cum_min_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 1 ┆ 3 │ + │ 4 ┆ 1 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + ''' + def cum_max(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_max().alias("cum_max"), + ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_max ┆ cum_max_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 2 ┆ 4 │ + │ 3 ┆ 3 ┆ 4 │ + │ 4 ┆ 4 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_max().alias("cum_max"), + ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬─────────┬────────────────────┐ + │ values ┆ cum_max ┆ cum_max_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════════╪════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴─────────┴────────────────────┘ + + ''' + def cum_count(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. 
+ + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_count().alias("cum_count"), + ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), + ... ) + shape: (4, 3) + ┌─────┬───────────┬───────────────────┐ + │ a ┆ cum_count ┆ cum_count_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ u32 ┆ u32 │ + ╞═════╪═══════════╪═══════════════════╡ + │ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 2 ┆ 1 │ + │ 4 ┆ 3 ┆ 0 │ + └─────┴───────────┴───────────────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def round_sig_figs(self, digits: int) -> Self: + ''' + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) + >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) + shape: (3, 2) + ┌─────────┬────────────────┐ + │ a ┆ round_sig_figs │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════════╪════════════════╡ + │ 0.01234 ┆ 0.012 │ + │ 3.333 ┆ 3.3 │ + │ 1234.0 ┆ 1200.0 │ + └─────────┴────────────────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... 
) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + See Also + -------- + Expr.get : Take a single value + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg( + ... pl.col("value").gather([2, 1]) + ... ) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + ''' + def get(self, index: int | Expr) -> Self: + ''' + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. 
+ + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... 
) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), + ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), + ... 
).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for `map` functions is transforming the values + represented by an expression using a third-party library. + + .. warning:: + If you are looking to map a function over a window function or group_by + context, refer to :func:`map_elements` instead. + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_elements + replace + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type `Callable[[Any], Any]`. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type `Callable[[Series], Any]`. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be `pl.Unknown`. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). 
+ pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using `map_elements` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using `over` is considered a GroupBy context + here, so `map_elements` can be used to map functions over window groups. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using `over` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... 
).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").gather_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator `expr & other & ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator `expr | other | ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other` where `None == None`. + + This differs from default `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator `expr >= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ true │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator `expr > other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator `expr <= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator `expr < other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator `expr != other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... 
} + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr != other` where `None == None`. + + This differs from default `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator `expr + other`. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator `expr // other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator `expr % other`. + + Parameters + ---------- + other + Numeric literal or expression value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator `expr * other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator `expr - other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator `expr / other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator `expr ** exponent`. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator `expr ^ other`. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) + shape: (3, 3) + ┌───────────┬──────────────────┬──────────┐ + │ sets ┆ optional_members ┆ contains │ + │ --- ┆ --- ┆ --- │ + │ list[i64] ┆ i64 ┆ bool │ + ╞═══════════╪══════════════════╪══════════╡ + │ [1, 2, 3] ┆ 1 ┆ true │ + │ [1, 2] ┆ 2 ┆ true │ + │ [9, 10] ┆ 3 ┆ false │ + └───────────┴──────────────────┴──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. 
Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with `lit` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... 
pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... 
window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Self: + ''' + Compute a rolling standard deviation. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... 
+ - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. 
+ + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. 
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: + ''' + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. 
+ + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. 
math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) 
-> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def hist(self, bins: IntoExpr | None = ...) -> Self: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + include_breakpoint + Include a column that indicates the upper breakpoint. + include_category + Include a column that shows the intervals as categories. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 3, 8, 8, 2, 1, 3]}) + >>> df.select(pl.col("a").hist(bins=[1, 2, 3])) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 2 │ + │ 2 │ + └─────┘ + >>> df.select( + ... pl.col("a").hist( + ... bins=[1, 2, 3], include_breakpoint=True, include_category=True + ... ) + ... ) + shape: (4, 1) + ┌───────────────────────┐ + │ a │ + │ --- │ + │ struct[3] │ + ╞═══════════════════════╡ + │ {1.0,"(-inf, 1.0]",2} │ + │ {2.0,"(1.0, 2.0]",1} │ + │ {3.0,"(2.0, 3.0]",2} │ + │ {inf,"(3.0, inf]",2} │ + └───────────────────────┘ + + ''' + def replace(self, old: IntoExpr | Sequence[Any] | Mapping[Any, Any], new: IntoExpr | Sequence[Any] | NoDefault = ...) -> Self: + ''' + Replace values by different values. + + Parameters + ---------- + old + Value or sequence of values to replace. + Accepts expression input. Sequences are parsed as Series, + other non-expression inputs are parsed as literals. + Also accepts a mapping of values to their replacement as syntactic sugar for + `replace(new=Series(mapping.keys()), old=Series(mapping.values()))`. + new + Value or sequence of values to replace by. + Accepts expression input. Sequences are parsed as Series, + other non-expression inputs are parsed as literals. + Length must match the length of `old` or have length 1. + default + Set values that were not replaced to this value. + Defaults to keeping the original value. + Accepts expression input. Non-expression inputs are parsed as literals. + return_dtype + The data type of the resulting expression. If set to `None` (default), + the data type is determined automatically based on the other inputs. + + See Also + -------- + str.replace + + Notes + ----- + The global string cache must be enabled when replacing categorical values. + + Examples + -------- + Replace a single value by another value. Values that were not replaced remain + unchanged. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) + >>> df.with_columns(replaced=pl.col("a").replace(2, 100)) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 3 │ + └─────┴──────────┘ + + Replace multiple values by passing sequences to the `old` and `new` parameters. + + >>> df.with_columns(replaced=pl.col("a").replace([2, 3], [100, 200])) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 200 │ + └─────┴──────────┘ + + Passing a mapping with replacements is also supported as syntactic sugar. + Specify a default to set all values that were not matched. + + >>> mapping = {2: 100, 3: 200} + >>> df.with_columns(replaced=pl.col("a").replace(mapping, default=-1)) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ -1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 200 │ + └─────┴──────────┘ + + Replacing by values of a different data type sets the return type based on + a combination of the `new` data type and either the original data type or the + default data type if it was set. 
+ + >>> df = pl.DataFrame({"a": ["x", "y", "z"]}) + >>> mapping = {"x": 1, "y": 2, "z": 3} + >>> df.with_columns(replaced=pl.col("a").replace(mapping)) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + >>> df.with_columns(replaced=pl.col("a").replace(mapping, default=None)) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + + Set the `return_dtype` parameter to control the resulting data type directly. + + >>> df.with_columns( + ... replaced=pl.col("a").replace(mapping, return_dtype=pl.UInt8) + ... ) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ u8 │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + + Expression input is supported for all parameters. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3], "b": [1.5, 2.5, 5.0, 1.0]}) + >>> df.with_columns( + ... replaced=pl.col("a").replace( + ... old=pl.col("a").max(), + ... new=pl.col("b").sum(), + ... default=pl.col("b"), + ... ) + ... ) + shape: (4, 3) + ┌─────┬─────┬──────────┐ + │ a ┆ b ┆ replaced │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═════╪══════════╡ + │ 1 ┆ 1.5 ┆ 1.5 │ + │ 2 ┆ 2.5 ┆ 2.5 │ + │ 2 ┆ 5.0 ┆ 5.0 │ + │ 3 ┆ 1.0 ┆ 10.0 │ + └─────┴─────┴──────────┘ + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + `polars.Unknown`. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by `lib::symbol`. + + The parameters you give dictate how polars will deal + with the function. Make sure they are correct! + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. + returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like `sum`, `min`, `covariance` etc. + cast_to_supertypes + Cast the input datatypes to their supertype. + pass_name_to_apply + if set, then the `Series` passed to the function in the group_by operation + will ensure the name is set. This is an extra heap allocation per group. + changes_length + For example a `unique` or a `slice` + + """ + def _register_plugin(self) -> Self: ... + def take_every(self, n: int) -> Self: + """ + Take every nth value in the Series and return as a new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. 
+ + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + """ + def cumsum(self) -> Self: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumprod(self) -> Self: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummin(self) -> Self: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummax(self) -> Self: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumcount(self) -> Self: + """ + Get an array with the cumulative count computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_count`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in column according to remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
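
The Expr stub hunk above documents the rolling_*, ewm_* and replace methods together with their doctest examples. As a quick sanity check (this snippet is an illustration added here, not part of the diff; it assumes a polars install in the 0.19/0.20 range that this stub targets), a short script can exercise a few of the documented signatures against the runtime:

    import polars as pl

    df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]})

    # Fixed-size windows, as in the rolling_var / rolling_median docstrings above.
    print(
        df.with_columns(
            rolling_var=pl.col("A").rolling_var(window_size=2),
            rolling_median=pl.col("A").rolling_median(window_size=2),
        )
    )

    # Exponentially-weighted mean and value replacement, also documented above.
    print(df.select(pl.col("A").ewm_mean(com=1)))
    print(
        pl.DataFrame({"a": [1, 2, 2, 3]}).with_columns(
            replaced=pl.col("a").replace({2: 100, 3: 200})
        )
    )

If any of these calls fail at runtime or no longer match the stubbed signatures, that is a hint the generated stub and the installed polars version have drifted apart.
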
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/lazyframe/frame deleted file mode 100644 index 561f5b2..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/lazyframe/frame +++ /dev/null @@ -1,4211 +0,0 @@ -import P -import np -import pa -from builtins import PyLazyFrame -from pathlib import Path -from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 -from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat -from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy -from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath -from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence - -TYPE_CHECKING: bool -DTYPE_TEMPORAL_UNITS: frozenset -N_INFER_DEFAULT: int - -class LazyFrame: - _accessors: _ClassVar[set] = ... - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - @classmethod - def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: - """ - Lazily read from a CSV file or multiple files via glob patterns. - - Use `pl.scan_csv` to dispatch to this method. - - See Also - -------- - polars.io.scan_csv - - """ - @classmethod - def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: - """ - Lazily read from a parquet file or multiple files via glob patterns. - - Use `pl.scan_parquet` to dispatch to this method. 
- - See Also - -------- - polars.io.scan_parquet - - """ - @classmethod - def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: - """ - Lazily read from an Arrow IPC (Feather v2) file. - - Use `pl.scan_ipc` to dispatch to this method. - - See Also - -------- - polars.io.scan_ipc - - """ - @classmethod - def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: - """ - Lazily read from a newline delimited JSON file. - - Use `pl.scan_ndjson` to dispatch to this method. - - See Also - -------- - polars.io.scan_ndjson - - """ - @classmethod - def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: - """ - Read a logical plan from a JSON string to construct a LazyFrame. - - .. deprecated:: 0.18.12 - This method is deprecated. Convert the JSON string to `StringIO` - and then use `LazyFrame.deserialize`. - - Parameters - ---------- - json - String in JSON format. - - See Also - -------- - deserialize - - """ - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: - """ - Read a logical plan from a JSON file to construct a LazyFrame. - - .. deprecated:: 0.18.12 - This class method has been renamed to `deserialize`. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - - See Also - -------- - deserialize - - """ - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: - ''' - Read a logical plan from a JSON file to construct a LazyFrame. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - - See Also - -------- - LazyFrame.serialize - - Examples - -------- - >>> import io - >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() - >>> json = lf.serialize() - >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def __dataframe_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. - """ - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... - def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def _repr_html_(self) -> str: ... - def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: - ''' - Serialize the logical plan of this LazyFrame to a file or string in JSON format. - - Parameters - ---------- - file - File path to which the result should be written. If set to `None` - (default), the output is returned as a string instead. 
- - See Also - -------- - LazyFrame.deserialize - - Examples - -------- - Serialize the logical plan into a JSON string. - - >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() - >>> json = lf.serialize() - >>> json - \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' - - The logical plan can later be deserialized back into a LazyFrame. - - >>> import io - >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: - """ - Serialize the logical plan of this LazyFrame to a file or string in JSON format. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`LazyFrame.serialize`. - - Parameters - ---------- - file - File path to which the result should be written. If set to `None` - (default), the output is returned as a string instead. - """ - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the frame as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Examples - -------- - >>> def cast_str_to_int(data, col_name): - ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) - ... - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": ["10", "20", "30", "40"], - ... } - ... ) - >>> lf.pipe(cast_str_to_int, col_name="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 30 │ - │ 4 ┆ 40 │ - └─────┴─────┘ - - >>> lf = pl.LazyFrame( - ... { - ... "b": [1, 2], - ... "a": [3, 4], - ... } - ... ) - >>> lf.collect() - shape: (2, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - └─────┴─────┘ - >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 1 │ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def explain(self) -> str: - ''' - Create a string representation of the query plan. - - Different optimizations can be turned on or off. - - Parameters - ---------- - optimized - Return an optimized query plan. Defaults to `True`. - If this is set to `True` the subsequent - optimization flags control which optimizations - run. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... 
"b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).explain() # doctest: +SKIP - ''' - def show_graph(self) -> str | None: - ''' - Show a plot of the query plan. Note that you should have graphviz installed. - - Parameters - ---------- - optimized - Optimize the query plan. - show - Show the figure. - output_path - Write the figure to disk. - raw_output - Return dot syntax. This cannot be combined with `show` and/or `output_path`. - figsize - Passed to matplotlib if `show` == True. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).show_graph() # doctest: +SKIP - - ''' - def inspect(self, fmt: str = ...) -> Self: - ''' - Inspect a node in the computation graph. - - Print the value that this node in the computation graph evaluates to and passes - on the value. - - Examples - -------- - >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) - >>> ( - ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) - ... .inspect() # print the node before the filter - ... .filter(pl.col("bar") == pl.col("foo")) - ... ) # doctest: +ELLIPSIS - - - ''' - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: - ''' - Sort the DataFrame by the given columns. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. - *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, None], - ... "b": [6.0, 5.0, 4.0], - ... "c": ["a", "c", "b"], - ... } - ... ) - >>> lf.sort("a").collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - Sorting by expressions is also supported. - - >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - └──────┴─────┴─────┘ - - Sort by multiple columns by passing a list of columns. 
- - >>> lf.sort(["c", "a"], descending=True).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - └──────┴─────┴─────┘ - - Or use positional arguments to sort by multiple columns in the same way. - - >>> lf.sort("c", "a", descending=[False, True]).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - ''' - def top_k(self, k: int) -> Self: - ''' - Return the `k` largest elements. - - If \'descending=True` the smallest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might - be worse since this requires a stable search. - - See Also - -------- - bottom_k - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 largest values in column b. - - >>> lf.top_k(4, by="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ a ┆ 2 │ - │ b ┆ 2 │ - │ b ┆ 1 │ - └─────┴─────┘ - - Get the rows which contain the 4 largest values when sorting on column b and a. - - >>> lf.top_k(4, by=["b", "a"]).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 2 │ - │ c ┆ 1 │ - └─────┴─────┘ - - ''' - def bottom_k(self, k: int) -> Self: - ''' - Return the `k` smallest elements. - - If \'descending=True` the largest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - top_k - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 smallest values in column b. - - >>> lf.bottom_k(4, by="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 1 │ - │ a ┆ 1 │ - │ c ┆ 1 │ - │ a ┆ 2 │ - └─────┴─────┘ - - Get the rows which contain the 4 smallest values when sorting on column a and b. - - >>> lf.bottom_k(4, by=["a", "b"]).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ b ┆ 1 │ - │ b ┆ 2 │ - └─────┴─────┘ - - ''' - def profile(self) -> tuple[DataFrame, DataFrame]: - ''' - Profile a LazyFrame. 
- - This will run the query and return a tuple - containing the materialized DataFrame and a DataFrame that - contains profiling information of each node that is executed. - - The units of the timings are microseconds. - - Parameters - ---------- - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - show_plot - Show a gantt chart of the profiling result - truncate_nodes - Truncate the label lengths in the gantt chart to this number of - characters. - figsize - matplotlib figsize of the profiling plot - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).profile() # doctest: +SKIP - (shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘, - shape: (3, 3) - ┌─────────────────────────┬───────┬──────┐ - │ node ┆ start ┆ end │ - │ --- ┆ --- ┆ --- │ - │ str ┆ u64 ┆ u64 │ - ╞═════════════════════════╪═══════╪══════╡ - │ optimization ┆ 0 ┆ 5 │ - │ group_by_partitioned(a) ┆ 5 ┆ 470 │ - │ sort(a) ┆ 475 ┆ 1964 │ - └─────────────────────────┴───────┴──────┘) - - ''' - def collect(self) -> DataFrame: - ''' - Materialize this LazyFrame into a DataFrame. - - By default, all query optimizations are enabled. Individual optimizations may - be disabled by setting the corresponding parameter to `False`. - - Parameters - ---------- - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - no_optimization - Turn off (certain) optimizations. - streaming - Process the query in batches to handle larger-than-memory data. - If set to `False` (default), the entire query is processed in a single - batch. - - .. warning:: - This functionality is currently in an alpha state. - - .. note:: - Use :func:`explain` to see if Polars can process the query in streaming - mode. - - Returns - ------- - DataFrame - - See Also - -------- - fetch: Run the query on the first `n` rows only for debugging purposes. - explain : Print the query plan that is evaluated with collect. - profile : Collect the LazyFrame and time each node in the computation graph. - polars.collect_all : Collect multiple LazyFrames at the same time. - polars.Config.set_streaming_chunk_size : Set the size of streaming batches. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... 
) - >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - - Collect in streaming mode - - >>> lf.group_by("a").agg(pl.all().sum()).collect( - ... streaming=True - ... ) # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: - ''' - Collect DataFrame asynchronously in thread pool. - - Collects into a DataFrame (like :func:`collect`), but instead of returning - DataFrame directly, they are scheduled to be collected inside thread pool, - while this method returns almost instantly. - - May be useful if you use gevent or asyncio and want to release control to other - greenlets/tasks while LazyFrames are being collected. - - Parameters - ---------- - gevent - Return wrapper to `gevent.event.AsyncResult` instead of Awaitable - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Notes - ----- - In case of error `set_exception` is used on - `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - See Also - -------- - polars.collect_all : Collect multiple LazyFrames at the same time. - polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. - - Returns - ------- - If `gevent=False` (default) then returns awaitable. - - If `gevent=True` then returns wrapper that has - `.get(block=True, timeout=None)` method. - - Examples - -------- - >>> import asyncio - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> async def main(): - ... return await ( - ... lf.group_by("a", maintain_order=True) - ... .agg(pl.all().sum()) - ... .collect_async() - ... ) - ... - >>> asyncio.run(main()) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - ''' - def sink_parquet(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to a Parquet file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. 
- Choose "snappy" for more backwards compatibility guarantees - when you deal with older parquet readers. - compression_level - The level of compression to use. Higher compression means smaller files on - disk. - - - "gzip" : min-level: 0, max-level: 10. - - "brotli" : min-level: 0, max-level: 11. - - "zstd" : min-level: 1, max-level: 22. - statistics - Write statistics to the parquet headers. This requires extra compute. - row_group_size - Size of the row groups in number of rows. - If None (default), the chunks of the `DataFrame` are - used. Writing in smaller chunks may reduce memory pressure and improve - writing speeds. - data_pagesize_limit - Size limit of individual data pages. - If not set defaults to 1024 * 1024 bytes - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_parquet("out.parquet") # doctest: +SKIP - - ''' - def sink_ipc(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to an IPC file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - compression : {\'lz4\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_ipc("out.arrow") # doctest: +SKIP - - ''' - def sink_csv(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to a CSV file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - include_bom - Whether to include UTF-8 BOM in the CSV output. - include_header - Whether to include header in the CSV output. - separator - Separate CSV fields with this symbol. - line_terminator - String used to end each row. - quote_char - Byte to use as quoting character. - batch_size - Number of rows that will be processed per thread. - datetime_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. If no format specified, the default fractional-second - precision is inferred from the maximum timeunit found in the frame\'s - Datetime cols (if any). - date_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. - time_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. 
- float_precision - Number of decimal places to write, applied to both `Float32` and - `Float64` datatypes. - null_value - A string representing null values (defaulting to the empty string). - quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} - Determines the quoting strategy used. - - - necessary (default): This puts quotes around fields only when necessary. - They are necessary when fields contain a quote, - delimiter or record terminator. - Quotes are also necessary when writing an empty record - (which is indistinguishable from a record with one empty field). - This is the default. - - always: This puts quotes around every field. Always. - - never: This never puts quotes around fields, even if that results in - invalid CSV data (e.g.: by not quoting strings containing the - separator). - - non_numeric: This puts quotes around all fields that are non-numeric. - Namely, when writing a field that does not parse as a valid float - or integer, then quotes will be used even if they aren`t strictly - necessary. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_csv("out.csv") # doctest: +SKIP - - ''' - def sink_ndjson(self, path: str | Path) -> DataFrame: - ''' - Persists a LazyFrame at the provided path. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_json("out.json") # doctest: +SKIP - - ''' - def _set_sink_optimizations(self) -> PyLazyFrame: ... - def fetch(self, n_rows: int = ...) -> DataFrame: - ''' - Collect a small number of rows for debugging purposes. - - Parameters - ---------- - n_rows - Collect n_rows from the data sources. - type_coercion - Run type coercion optimization. - predicate_pushdown - Run predicate pushdown optimization. - projection_pushdown - Run projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off optimizations. - slice_pushdown - Slice pushdown optimization - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. 
- streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Notes - ----- - This is similar to a :func:`collect` operation, but it overwrites the number of - rows read by *every* scan operation. Be aware that `fetch` does not guarantee - the final number of rows in the DataFrame. Filters, join operations and fewer - rows being available in the scanned data will all influence the final number - of rows (joins are especially susceptible to this, and may return no data - at all if `n_rows` is too small as the join keys may not be present). - - Warnings - -------- - This is strictly a utility function that can help to debug queries using a - smaller number of rows, and should *not* be used in production code. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 6 │ - │ b ┆ 2 ┆ 5 │ - └─────┴─────┴─────┘ - - ''' - def lazy(self) -> Self: - ''' - Return lazy representation, i.e. itself. - - Useful for writing code that expects either a :class:`DataFrame` or - :class:`LazyFrame`. - - Returns - ------- - LazyFrame - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.lazy() # doctest: +ELLIPSIS - - - ''' - def cache(self) -> Self: - """Cache the result once the execution of the physical plan hits this node.""" - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: - ''' - Cast LazyFrame column(s) to the specified dtype(s). - - Parameters - ---------- - dtypes - Mapping of column names (or selector) to dtypes, or a single dtype - to which all columns will be cast. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> from datetime import date - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], - ... } - ... ) - - Cast specific frame columns to the specified dtypes: - - >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ u8 ┆ date │ - ╞═════╪═════╪════════════╡ - │ 1.0 ┆ 6 ┆ 2020-01-02 │ - │ 2.0 ┆ 7 ┆ 2021-03-04 │ - │ 3.0 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - Cast all frame columns to the specified dtype: - - >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) - {\'foo\': [\'1\', \'2\', \'3\'], - \'bar\': [\'6.0\', \'7.0\', \'8.0\'], - \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} - - Use selectors to define the columns being cast: - - >>> import polars.selectors as cs - >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ str │ - ╞═════╪═════╪════════════╡ - │ 1 ┆ 6 ┆ 2020-01-02 │ - │ 2 ┆ 7 ┆ 2021-03-04 │ - │ 3 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - ''' - def clear(self, n: int = ...) -> LazyFrame: - ''' - Create an empty copy of the current LazyFrame, with zero to \'n\' rows. 
- - Returns a copy with an identical schema but no data. - - Parameters - ---------- - n - Number of (empty) rows to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.clear().fetch() - shape: (0, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪═════╪══════╡ - └─────┴─────┴──────┘ - - >>> lf.clear(2).fetch() - shape: (2, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪══════╪══════╡ - │ null ┆ null ┆ null │ - │ null ┆ null ┆ null │ - └──────┴──────┴──────┘ - - ''' - def clone(self) -> Self: - ''' - Create a copy of this LazyFrame. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current LazyFrame, with identical - schema but no data. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.clone() # doctest: +ELLIPSIS - - - ''' - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: - ''' - Filter the rows in the LazyFrame based on a predicate expression. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - predicates - Expression that evaluates to a boolean Series. - constraints - Column filters. Use name=value to filter column name by the supplied value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - - Filter on one condition: - - >>> lf.filter(pl.col("foo") > 1).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Filter on multiple conditions: - - >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `*args` syntax: - - >>> lf.filter( - ... pl.col("foo") == 1, - ... pl.col("ham") == "a", - ... ).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `**kwargs` syntax: - - >>> lf.filter(foo=1, ham="a").collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Filter on an OR condition: - - >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - ''' - Select columns from this LazyFrame. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. 
Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Examples - -------- - Pass the name of a column to select that column. - - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.select("foo").collect() - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Multiple columns can be selected by passing a list of column names. - - >>> lf.select(["foo", "bar"]).collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - └─────┴─────┘ - - Multiple columns can also be selected using positional arguments instead of a - list. Expressions are also accepted. - - >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - └─────┴─────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> lf.select( - ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) - ... ).collect() - shape: (3, 1) - ┌───────────┐ - │ threshold │ - │ --- │ - │ i32 │ - ╞═══════════╡ - │ 0 │ - │ 0 │ - │ 10 │ - └───────────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... lf.select( - ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), - ... ).collect() - ... - shape: (3, 1) - ┌───────────┐ - │ is_odd │ - │ --- │ - │ struct[2] │ - ╞═══════════╡ - │ {1,0} │ - │ {0,1} │ - │ {1,0} │ - └───────────┘ - - ''' - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - """ - Select columns from this LazyFrame. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - See Also - -------- - select - - """ - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: - ''' - Start a group by operation. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Setting this to `True` blocks the possibility - to run on the streaming engine. - - Examples - -------- - Group by one column and call `agg` to compute the grouped sum of another - column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... 
) - >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 2 │ - │ b ┆ 5 │ - │ c ┆ 3 │ - └─────┴─────┘ - - Set `maintain_order=True` to ensure the order of the groups is consistent with - the input. - - >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() - shape: (3, 2) - ┌─────┬───────────┐ - │ a ┆ c │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [5, 3] │ - │ b ┆ [4, 2] │ - │ c ┆ [1] │ - └─────┴───────────┘ - - Group by multiple columns by passing a list of column names. - - >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - Or use positional arguments to group by multiple columns in the same way. - Expressions are also accepted. - - >>> lf.group_by("a", pl.col("b") // 2).agg( - ... pl.col("c").mean() - ... ).collect() # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ f64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 0 ┆ 4.0 │ - │ b ┆ 1 ┆ 3.0 │ - │ c ┆ 1 ┆ 1.0 │ - └─────┴─────┴─────┘ - - ''' - def rolling(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - Different from a `dynamic_group_by` the windows are now determined by the - individual values and are not of constant intervals. For constant intervals - use :func:`LazyFrame.group_by_dynamic`. - - If you have a time series ``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... - * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... - * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). 
- by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - group_by_dynamic - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> out = ( - ... df.rolling(index_column="dt", period="2d") - ... .agg( - ... pl.sum("a").alias("sum_a"), - ... pl.min("a").alias("min_a"), - ... pl.max("a").alias("max_a"), - ... ) - ... .collect() - ... ) - >>> out - shape: (6, 4) - ┌─────────────────────┬───────┬───────┬───────┐ - │ dt ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴───────┴───────┴───────┘ - - ''' - def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - Time windows are calculated and rows are assigned to windows. Different from a - normal group by is that a row can be member of multiple groups. - By default, the windows look like: - - - [start, start + period) - - [start + every, start + every + period) - - [start + 2*every, start + 2*every + period) - - ... - - where `start` is determined by `start_by`, `offset`, and `every` (see parameter - descriptions below). - - .. warning:: - The index column must be sorted in ascending order. If `by` is passed, then - the index column must be sorted in ascending order within each group. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - - .. deprecated:: 0.19.4 - Use `label` instead. - include_boundaries - Add the lower and upper bound of the window to the "_lower_boundary" and - "_upper_boundary" columns. 
This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - label : {\'left\', \'right\', \'datapoint\'} - Define which label to use for the window: - - - \'left\': lower boundary of the window - - \'right\': upper boundary of the window - - \'datapoint\': the first value of the index column in the given window. - If you don\'t need the label to be at one of the boundaries, choose this - option for maximum performance - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - rolling - - Notes - ----- - 1) If you\'re coming from pandas, then - - .. code-block:: python - - # polars - df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) - - is equivalent to - - .. code-block:: python - - # pandas - df.set_index("ts").resample("D")["value"].sum().reset_index() - - though note that, unlike pandas, polars doesn\'t add extra rows for empty - windows. If you need `index_column` to be evenly spaced, then please combine - with :func:`DataFrame.upsample`. - - 2) The `every`, `period` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a group_by_dynamic on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Examples - -------- - >>> from datetime import datetime - >>> lf = pl.LazyFrame( - ... { - ... "time": pl.datetime_range( - ... start=datetime(2021, 12, 16), - ... end=datetime(2021, 12, 16, 3), - ... interval="30m", - ... eager=True, - ... ), - ... "n": range(7), - ... } - ... 
) - >>> lf.collect() - shape: (7, 2) - ┌─────────────────────┬─────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i64 │ - ╞═════════════════════╪═════╡ - │ 2021-12-16 00:00:00 ┆ 0 │ - │ 2021-12-16 00:30:00 ┆ 1 │ - │ 2021-12-16 01:00:00 ┆ 2 │ - │ 2021-12-16 01:30:00 ┆ 3 │ - │ 2021-12-16 02:00:00 ┆ 4 │ - │ 2021-12-16 02:30:00 ┆ 5 │ - │ 2021-12-16 03:00:00 ┆ 6 │ - └─────────────────────┴─────┘ - - Group by windows of 1 hour starting at 2021-12-16 00:00:00. - - >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( - ... pl.col("n") - ... ).collect() - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [1, 2] │ - │ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ 2021-12-16 02:00:00 ┆ [5, 6] │ - └─────────────────────┴───────────┘ - - The window boundaries can also be added to the aggregation result - - >>> lf.group_by_dynamic( - ... "time", every="1h", include_boundaries=True, closed="right" - ... ).agg(pl.col("n").mean()).collect() - shape: (4, 4) - ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ - │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ - ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ - │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ - │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ - │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ - │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ - └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ - - When closed="left", the window excludes the right end of interval: - [lower_bound, upper_bound) - - >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( - ... pl.col("n") - ... ).collect() - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-16 00:00:00 ┆ [0, 1] │ - │ 2021-12-16 01:00:00 ┆ [2, 3] │ - │ 2021-12-16 02:00:00 ┆ [4, 5] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - When closed="both" the time values at the window boundaries belong to 2 groups. - - >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( - ... pl.col("n") - ... ).collect() - shape: (5, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ - │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - Dynamic group bys can also be combined with grouping on normal keys - - >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) - >>> lf.collect() - shape: (7, 3) - ┌─────────────────────┬─────┬────────┐ - │ time ┆ n ┆ groups │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ str │ - ╞═════════════════════╪═════╪════════╡ - │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ - │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ - │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ - │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ - │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ - │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ - │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ - └─────────────────────┴─────┴────────┘ - >>> lf.group_by_dynamic( - ... "time", - ... 
every="1h", - ... closed="both", - ... by="groups", - ... include_boundaries=True, - ... ).agg(pl.col("n")).collect() - shape: (7, 5) - ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ - │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ - ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ - │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ - │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ - │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ - │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ - │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ - └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ - - Dynamic group by on an index column - - >>> lf = pl.LazyFrame( - ... { - ... "idx": pl.int_range(0, 6, eager=True), - ... "A": ["A", "A", "B", "B", "B", "C"], - ... } - ... ) - >>> lf.group_by_dynamic( - ... "idx", - ... every="2i", - ... period="3i", - ... include_boundaries=True, - ... closed="right", - ... ).agg(pl.col("A").alias("A_agg_list")).collect() - shape: (4, 4) - ┌─────────────────┬─────────────────┬─────┬─────────────────┐ - │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 ┆ list[str] │ - ╞═════════════════╪═════════════════╪═════╪═════════════════╡ - │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ - │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ - │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ - │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ - └─────────────────┴─────────────────┴─────┴─────────────────┘ - - ''' - def join_asof(self, other: LazyFrame) -> Self: - ''' - Perform an asof join. - - This is similar to a left-join except that we match on nearest key rather than - equal keys. - - Both DataFrames must be sorted by the join_asof key. - - For each row in the left DataFrame: - - - A "backward" search selects the last row in the right DataFrame whose - \'on\' key is less than or equal to the left\'s key. - - - A "forward" search selects the first row in the right DataFrame whose - \'on\' key is greater than or equal to the left\'s key. - - A "nearest" search selects the last row in the right DataFrame whose value - is nearest to the left\'s key. String keys are not currently supported for a - nearest search. - - The default is "backward". - - Parameters - ---------- - other - Lazy DataFrame to join with. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - by - Join on these columns before doing asof join. - by_left - Join on these columns before doing asof join. - by_right - Join on these columns before doing asof join. - strategy : {\'backward\', \'forward\', \'nearest\'} - Join strategy. - suffix - Suffix to append to columns with a duplicate name. - tolerance - Numeric tolerance. By setting this the join will only be done if the near - keys are within this distance. 
If an asof join is done on columns of dtype - "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta - object or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. - - Examples - -------- - >>> from datetime import datetime - >>> gdp = pl.LazyFrame( - ... { - ... "date": [ - ... datetime(2016, 1, 1), - ... datetime(2017, 1, 1), - ... datetime(2018, 1, 1), - ... datetime(2019, 1, 1), - ... ], # note record date: Jan 1st (sorted!) - ... "gdp": [4164, 4411, 4566, 4696], - ... } - ... ).set_sorted("date") - >>> population = pl.LazyFrame( - ... { - ... "date": [ - ... datetime(2016, 5, 12), - ... datetime(2017, 5, 12), - ... datetime(2018, 5, 12), - ... datetime(2019, 5, 12), - ... ], # note record date: May 12th (sorted!) - ... "population": [82.19, 82.66, 83.12, 83.52], - ... } - ... ).set_sorted("date") - >>> population.join_asof(gdp, on="date", strategy="backward").collect() - shape: (4, 3) - ┌─────────────────────┬────────────┬──────┐ - │ date ┆ population ┆ gdp │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ f64 ┆ i64 │ - ╞═════════════════════╪════════════╪══════╡ - │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ - │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ - │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ - │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ - └─────────────────────┴────────────┴──────┘ - - ''' - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: - ''' - Add a join operation to the Logical Plan. - - Parameters - ---------- - other - Lazy DataFrame to join with. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} - Join strategy. - - .. note:: - A left join preserves the row order of the left DataFrame. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - suffix - Suffix to append to columns with a duplicate name. - validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} - Checks if join is of specified type. - - * *many_to_many* - “m:m”: default, does not result in checks - * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets - * *one_to_many* - “1:m”: check if join keys are unique in left dataset - * *many_to_one* - “m:1”: check if join keys are unique in right dataset - - .. note:: - - - This is currently not supported the streaming engine. - - This is only supported when joined by single columns. - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. 
- - See Also - -------- - join_asof - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> other_lf = pl.LazyFrame( - ... { - ... "apple": ["x", "y", "z"], - ... "ham": ["a", "b", "d"], - ... } - ... ) - >>> lf.join(other_lf, on="ham").collect() - shape: (2, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - └─────┴─────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="outer").collect() - shape: (4, 4) - ┌──────┬──────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞══════╪══════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ null ┆ null ┆ d ┆ z │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └──────┴──────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="left").collect() - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └─────┴─────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="semi").collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 7.0 ┆ b │ - └─────┴─────┴─────┘ - >>> lf.join(other_lf, on="ham", how="anti").collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - ''' - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - Notes - ----- - Creating a new LazyFrame using this method does not create a new copy of - existing data. - - Examples - -------- - Pass an expression to add it as a new column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ a^2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ - └─────┴──────┴───────┴──────┘ - - Added columns will replace existing columns with the same name. - - >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1.0 ┆ 0.5 ┆ true │ - │ 2.0 ┆ 4.0 ┆ true │ - │ 3.0 ┆ 10.0 ┆ false │ - │ 4.0 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - Multiple columns can be added by passing a list of expressions. - - >>> lf.with_columns( - ... [ - ... 
(pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ] - ... ).collect() - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Multiple columns also can be added using positional arguments instead of a list. - - >>> lf.with_columns( - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ).collect() - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> lf.with_columns( - ... ab=pl.col("a") * pl.col("b"), - ... not_c=pl.col("c").not_(), - ... ).collect() - shape: (4, 5) - ┌─────┬──────┬───────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ ab ┆ not_c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ - └─────┴──────┴───────┴──────┴───────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... lf.drop("c").with_columns( - ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), - ... ).collect() - ... - shape: (4, 3) - ┌─────┬──────┬─────────────┐ - │ a ┆ b ┆ diffs │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ struct[2] │ - ╞═════╪══════╪═════════════╡ - │ 1 ┆ 0.5 ┆ {null,null} │ - │ 2 ┆ 4.0 ┆ {1,3.5} │ - │ 3 ┆ 10.0 ┆ {1,6.0} │ - │ 4 ┆ 13.0 ┆ {1,3.0} │ - └─────┴──────┴─────────────┘ - - ''' - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - """ - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - See Also - -------- - with_columns - - """ - def with_context(self, other: Self | list[Self]) -> Self: - ''' - Add an external context to the computation graph. - - This allows expressions to also access columns from DataFrames - that are not part of this one. - - Parameters - ---------- - other - Lazy DataFrame to join with. 
- - Examples - -------- - >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) - >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) - >>> lf.with_context(lf_other).select( - ... pl.col("b") + pl.col("c").first() - ... ).collect() - shape: (3, 1) - ┌──────┐ - │ b │ - │ --- │ - │ str │ - ╞══════╡ - │ afoo │ - │ cfoo │ - │ null │ - └──────┘ - - Fill nulls with the median from another DataFrame: - - >>> train_lf = pl.LazyFrame( - ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} - ... ) - >>> test_lf = pl.LazyFrame( - ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} - ... ) - >>> test_lf.with_context( - ... train_lf.select(pl.all().name.suffix("_train")) - ... ).select( - ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) - ... ).collect() - shape: (3, 1) - ┌───────────┐ - │ feature_0 │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -1.0 │ - │ 0.0 │ - │ 1.0 │ - └───────────┘ - - ''' - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Remove columns from the DataFrame. - - Parameters - ---------- - columns - Name of the column(s) that should be removed from the DataFrame. - *more_columns - Additional columns to drop, specified as positional arguments. - - Examples - -------- - Drop a single column by passing the name of that column. - - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.drop("ham").collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 6.0 │ - │ 2 ┆ 7.0 │ - │ 3 ┆ 8.0 │ - └─────┴─────┘ - - Drop multiple columns by passing a selector. - - >>> import polars.selectors as cs - >>> lf.drop(cs.numeric()).collect() - shape: (3, 1) - ┌─────┐ - │ ham │ - │ --- │ - │ str │ - ╞═════╡ - │ a │ - │ b │ - │ c │ - └─────┘ - - Use positional arguments to drop multiple columns. - - >>> lf.drop("foo", "ham").collect() - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 6.0 │ - │ 7.0 │ - │ 8.0 │ - └─────┘ - - ''' - def rename(self, mapping: dict[str, str]) -> Self: - ''' - Rename column names. - - Parameters - ---------- - mapping - Key value pairs that map from old name to new name. - - Notes - ----- - If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), - polars will block projection and predicate pushdowns at this node. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.rename({"foo": "apple"}).collect() - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - - ''' - def reverse(self) -> Self: - ''' - Reverse the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "key": ["a", "b", "c"], - ... "val": [1, 2, 3], - ... } - ... ) - >>> lf.reverse().collect() - shape: (3, 2) - ┌─────┬─────┐ - │ key ┆ val │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ c ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 1 │ - └─────┴─────┘ - - ''' - def shift(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. 
- fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... ) - >>> lf.shift().collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ null ┆ null │ - │ 1 ┆ 5 │ - │ 2 ┆ 6 │ - │ 3 ┆ 7 │ - └──────┴──────┘ - - Pass a negative value to shift in the opposite direction instead. - - >>> lf.shift(-2).collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ null ┆ null │ - │ null ┆ null │ - └──────┴──────┘ - - Specify `fill_value` to fill the resulting null values. - - >>> lf.shift(-2, fill_value=100).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ 100 ┆ 100 │ - │ 100 ┆ 100 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int, length: int | None = ...) -> Self: - ''' - Get a slice of this DataFrame. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> lf.slice(1, 2).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ y ┆ 3 ┆ 4 │ - │ z ┆ 5 ┆ 6 │ - └─────┴─────┴─────┘ - - ''' - def limit(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Alias for :func:`LazyFrame.head`. - - Parameters - ---------- - n - Number of rows to return. - - Notes - ----- - Consider using the :func:`fetch` operation if you only want to test your - query. The :func:`fetch` operation will load the first `n` rows at the scan - level, whereas the :func:`head`/:func:`limit` are applied at the end. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... ) - >>> lf.limit().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - └─────┴─────┘ - >>> lf.limit(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - └─────┴─────┘ - - ''' - def head(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Notes - ----- - Consider using the :func:`fetch` operation if you only want to test your - query. The :func:`fetch` operation will load the first `n` rows at the scan - level, whereas the :func:`head`/:func:`limit` are applied at the end. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... 
) - >>> lf.head().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - └─────┴─────┘ - >>> lf.head(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - └─────┴─────┘ - - ''' - def tail(self, n: int = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... ) - >>> lf.tail().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - │ 6 ┆ 12 │ - └─────┴─────┘ - >>> lf.tail(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 5 ┆ 11 │ - │ 6 ┆ 12 │ - └─────┴─────┘ - - ''' - def last(self) -> Self: - ''' - Get the last row of the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.last().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 5 ┆ 6 │ - └─────┴─────┘ - - ''' - def first(self) -> Self: - ''' - Get the first row of the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.first().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_n_unique(self) -> Self: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.approx_n_unique().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_unique(self) -> Self: - """ - Approximate count of unique values. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`LazyFrame.approx_n_unique`. - - """ - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: - ''' - Add a column at index 0 that counts the rows. - - Parameters - ---------- - name - Name of the column to add. - offset - Start the row count at this offset. - - Warnings - -------- - This can have a negative effect on query performance. - This may, for instance, block predicate pushdown optimization. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.with_row_count().collect() - shape: (3, 3) - ┌────────┬─────┬─────┐ - │ row_nr ┆ a ┆ b │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╡ - │ 0 ┆ 1 ┆ 2 │ - │ 1 ┆ 3 ┆ 4 │ - │ 2 ┆ 5 ┆ 6 │ - └────────┴─────┴─────┘ - - ''' - def gather_every(self, n: int) -> Self: - ''' - Take every nth row in the LazyFrame and return as a new LazyFrame. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... 
) - >>> lf.gather_every(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 5 │ - │ 3 ┆ 7 │ - └─────┴─────┘ - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - matches_supertype - Fill all matching supertypes of the fill `value` literal. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, None, 4], - ... "b": [0.5, 4, None, 13], - ... } - ... ) - >>> lf.fill_null(99).collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 99 ┆ 99.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - >>> lf.fill_null(strategy="forward").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> lf.fill_null(strategy="max").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> lf.fill_null(strategy="zero").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 0 ┆ 0.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Self: - ''' - Fill floating point NaN values. - - Parameters - ---------- - value - Value to fill the NaN values with. - - Warnings - -------- - Note that floating point NaN (Not a Number) are not missing values! - To replace missing values, use :func:`fill_null` instead. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1.5, 2, float("nan"), 4], - ... "b": [0.5, 4, float("nan"), 13], - ... } - ... ) - >>> lf.fill_nan(99).collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════╡ - │ 1.5 ┆ 0.5 │ - │ 2.0 ┆ 4.0 │ - │ 99.0 ┆ 99.0 │ - │ 4.0 ┆ 13.0 │ - └──────┴──────┘ - - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their standard deviation value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.std().collect() - shape: (1, 2) - ┌──────────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪═════╡ - │ 1.290994 ┆ 0.5 │ - └──────────┴─────┘ - >>> lf.std(ddof=0).collect() - shape: (1, 2) - ┌──────────┬──────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪══════════╡ - │ 1.118034 ┆ 0.433013 │ - └──────────┴──────────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their variance value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. 
- - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.var().collect() - shape: (1, 2) - ┌──────────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪══════╡ - │ 1.666667 ┆ 0.25 │ - └──────────┴──────┘ - >>> lf.var(ddof=0).collect() - shape: (1, 2) - ┌──────┬────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪════════╡ - │ 1.25 ┆ 0.1875 │ - └──────┴────────┘ - - ''' - def max(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their maximum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.max().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def min(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their minimum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.min().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 1 │ - └─────┴─────┘ - - ''' - def sum(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their sum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.sum().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 10 ┆ 5 │ - └─────┴─────┘ - - ''' - def mean(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their mean value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.mean().collect() - shape: (1, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════╡ - │ 2.5 ┆ 1.25 │ - └─────┴──────┘ - - ''' - def median(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their median value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.median().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 2.5 ┆ 1.0 │ - └─────┴─────┘ - - ''' - def null_count(self) -> Self: - ''' - Aggregate the columns in the LazyFrame as the sum of their null value count. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, None, 3], - ... "bar": [6, 7, None], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.null_count().collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 1 ┆ 0 │ - └─────┴─────┴─────┘ - - ''' - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... 
) - >>> lf.quantile(0.7).collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 3.0 ┆ 1.0 │ - └─────┴─────┘ - - ''' - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: - ''' - Explode the DataFrame to long format by exploding the given columns. - - Parameters - ---------- - columns - Column names, expressions, or a selector defining them. The underlying - columns being exploded must be of List or Utf8 datatype. - *more_columns - Additional names of columns to explode, specified as positional arguments. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "letters": ["a", "a", "b", "c"], - ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], - ... } - ... ) - >>> lf.explode("numbers").collect() - shape: (8, 2) - ┌─────────┬─────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════════╪═════════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ a ┆ 3 │ - │ b ┆ 4 │ - │ b ┆ 5 │ - │ c ┆ 6 │ - │ c ┆ 7 │ - │ c ┆ 8 │ - └─────────┴─────────┘ - - ''' - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Drop duplicate rows from this DataFrame. - - Parameters - ---------- - subset - Column name(s) or selector(s), to consider when identifying - duplicate rows. If set to `None` (default), use all columns. - keep : {\'first\', \'last\', \'any\', \'none\'} - Which of the duplicate rows to keep. - - * \'any\': Does not give any guarantee of which row is kept. - This allows more optimizations. - * \'none\': Don\'t keep duplicate rows. - * \'first\': Keep first unique row. - * \'last\': Keep last unique row. - maintain_order - Keep the same order as the original DataFrame. This is more expensive to - compute. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - Returns - ------- - LazyFrame - LazyFrame with unique rows. - - Warnings - -------- - This method will fail if there is a column of type `List` in the DataFrame or - subset. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3, 1], - ... "bar": ["a", "a", "a", "a"], - ... "ham": ["b", "b", "b", "b"], - ... } - ... ) - >>> lf.unique(maintain_order=True).collect() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> lf.unique(keep="last", maintain_order=True).collect() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - - ''' - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Drop all rows that contain null values. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - subset - Column name(s) for which null values are considered. - If set to `None` (default), use all columns. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, None, 8], - ... "ham": ["a", "b", None], - ... } - ... 
) - - The default behavior of this method is to drop rows where any single - value of the row is null. - - >>> lf.drop_nulls().collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - This behaviour can be constrained to consider only a subset of columns, as - defined by name or with a selector. For example, dropping rows if there is - a null in any of the integer columns: - - >>> import polars.selectors as cs - >>> lf.drop_nulls(subset=cs.integer()).collect() - shape: (2, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ null │ - └─────┴─────┴──────┘ - - This method drops a row if any single value of the row is null. - - Below are some example snippets that show how you could drop null - values based on other conditions: - - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, None, None, None], - ... "b": [1, 2, None, 1], - ... "c": [1, None, None, 1], - ... } - ... ) - >>> lf.collect() - shape: (4, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪══════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ null ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴──────┴──────┘ - - Drop a row only if all values are null: - - >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴─────┴──────┘ - - ''' - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: - ''' - Unpivot a DataFrame from wide to long format. - - Optionally leaves identifiers set. - - This function is useful to massage a DataFrame into a format where one or more - columns are identifier variables (id_vars) while all other columns, considered - measured variables (value_vars), are "unpivoted" to the row axis leaving just - two non-identifier columns, \'variable\' and \'value\'. - - Parameters - ---------- - id_vars - Column(s) or selector(s) to use as identifier variables. - value_vars - Column(s) or selector(s) to use as values variables; if `value_vars` - is empty all columns that are not in `id_vars` will be used. - variable_name - Name to give to the `variable` column. Defaults to "variable" - value_name - Name to give to the `value` column. Defaults to "value" - streamable - Allow this node to run in the streaming engine. - If this runs in streaming, the output of the melt operation - will not have a stable ordering. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> import polars.selectors as cs - >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() - shape: (6, 3) - ┌─────┬──────────┬───────┐ - │ a ┆ variable ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 │ - ╞═════╪══════════╪═══════╡ - │ x ┆ b ┆ 1 │ - │ y ┆ b ┆ 3 │ - │ z ┆ b ┆ 5 │ - │ x ┆ c ┆ 2 │ - │ y ┆ c ┆ 4 │ - │ z ┆ c ┆ 6 │ - └─────┴──────────┴───────┘ - - ''' - def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: - ''' - Apply a custom function. 
- - It is important that the function returns a Polars DataFrame. - - Parameters - ---------- - function - Lambda/ function to apply. - predicate_pushdown - Allow predicate pushdown optimization to pass this node. - projection_pushdown - Allow projection pushdown optimization to pass this node. - slice_pushdown - Allow slice pushdown optimization to pass this node. - no_optimizations - Turn off all optimizations past this point. - schema - Output schema of the function, if set to `None` we assume that the schema - will remain unchanged by the applied function. - validate_output_schema - It is paramount that polars\' schema is correct. This flag will ensure that - the output schema of this function will be checked with the expected schema. - Setting this to `False` will not do this check, but may lead to hard to - debug bugs. - streamable - Whether the function that is given is eligible to be running with the - streaming engine. That means that the function must produce the same result - when it is executed in batches or when it is be executed on the full - dataset. - - Warnings - -------- - The `schema` of a `LazyFrame` must always be correct. It is up to the caller - of this function to ensure that this invariant is upheld. - - It is important that the optimization flags are correct. If the custom function - for instance does an aggregation of a column, `predicate_pushdown` should not - be allowed, as this prunes rows and will influence your aggregation results. - - Examples - -------- - >>> lf = ( # doctest: +SKIP - ... pl.LazyFrame( - ... { - ... "a": pl.int_range(-100_000, 0, eager=True), - ... "b": pl.int_range(0, 100_000, eager=True), - ... } - ... ) - ... .map_batches(lambda x: 2 * x, streamable=True) - ... .collect(streaming=True) - ... ) - shape: (100_000, 2) - ┌─────────┬────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════════╪════════╡ - │ -200000 ┆ 0 │ - │ -199998 ┆ 2 │ - │ -199996 ┆ 4 │ - │ -199994 ┆ 6 │ - │ … ┆ … │ - │ -8 ┆ 199992 │ - │ -6 ┆ 199994 │ - │ -4 ┆ 199996 │ - │ -2 ┆ 199998 │ - └─────────┴────────┘ - - ''' - def interpolate(self) -> Self: - ''' - Interpolate intermediate values. The interpolation method is linear. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, None, 9, 10], - ... "bar": [6, 7, 9, None], - ... "baz": [1, None, None, 9], - ... } - ... ) - >>> lf.interpolate().collect() - shape: (4, 3) - ┌──────┬──────┬──────────┐ - │ foo ┆ bar ┆ baz │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════════╡ - │ 1.0 ┆ 6.0 ┆ 1.0 │ - │ 5.0 ┆ 7.0 ┆ 3.666667 │ - │ 9.0 ┆ 9.0 ┆ 6.333333 │ - │ 10.0 ┆ null ┆ 9.0 │ - └──────┴──────┴──────────┘ - - ''' - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Decompose struct columns into separate columns for each of their fields. - - The new columns will be inserted into the DataFrame at the location of the - struct column. - - Parameters - ---------- - columns - Name of the struct column(s) that should be unnested. - *more_columns - Additional columns to unnest, specified as positional arguments. - - Examples - -------- - >>> df = pl.LazyFrame( - ... { - ... "before": ["foo", "bar"], - ... "t_a": [1, 2], - ... "t_b": ["a", "b"], - ... "t_c": [True, None], - ... "t_d": [[1, 2], [3]], - ... "after": ["baz", "womp"], - ... } - ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") - >>> df.collect() - shape: (2, 3) - ┌────────┬─────────────────────┬───────┐ - │ before ┆ t_struct ┆ after │ - │ --- ┆ --- ┆ --- │ - │ str ┆ struct[4] ┆ str │ - ╞════════╪═════════════════════╪═══════╡ - │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ - │ bar ┆ {2,"b",null,[3]} ┆ womp │ - └────────┴─────────────────────┴───────┘ - >>> df.unnest("t_struct").collect() - shape: (2, 6) - ┌────────┬─────┬─────┬──────┬───────────┬───────┐ - │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ - ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ - │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ - │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ - └────────┴─────┴─────┴──────┴───────────┴───────┘ - - ''' - def merge_sorted(self, other: LazyFrame, key: str) -> Self: - ''' - Take two sorted DataFrames and merge them by the sorted key. - - The output of this operation will also be sorted. - It is the callers responsibility that the frames are sorted - by that key otherwise the output will not make sense. - - The schemas of both LazyFrames must be equal. - - Parameters - ---------- - other - Other DataFrame that must be merged - key - Key that is sorted. - - Examples - -------- - >>> df0 = pl.LazyFrame( - ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} - ... ).sort("age") - >>> df0.collect() - shape: (3, 2) - ┌───────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═════╡ - │ bob ┆ 18 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └───────┴─────┘ - >>> df1 = pl.LazyFrame( - ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} - ... ).sort("age") - >>> df1.collect() - shape: (4, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - └────────┴─────┘ - >>> df0.merge_sorted(df1, key="age").collect() - shape: (7, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ bob ┆ 18 │ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └────────┴─────┘ - ''' - def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: - """ - Indicate that one or multiple columns are sorted. - - Parameters - ---------- - column - Columns that are sorted - more_columns - Additional columns that are sorted, specified as positional arguments. - descending - Whether the columns are sorted in descending order. - """ - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: - ''' - Update the values in this `LazyFrame` with the non-null values in `other`. - - Parameters - ---------- - other - LazyFrame that will be used to update the values - on - Column names that will be joined on; if given `None` the implicit row - index is used as a join key instead. - left_on - Join column(s) of the left DataFrame. - right_on - Join column(s) of the right DataFrame. - how : {\'left\', \'inner\', \'outer\'} - * \'left\' will keep all rows from the left table; rows may be duplicated - if multiple rows in the right frame match the left row\'s key. - * \'inner\' keeps only those rows where the key exists in both frames. 
- * \'outer\' will update existing rows where the key matches while also - adding any new rows contained in the given frame. - include_nulls - If True, null values from the right DataFrame will be used to update the - left DataFrame. - - Notes - ----- - This is syntactic sugar for a left/inner join, with an optional coalesce when - `include_nulls = False`. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "A": [1, 2, 3, 4], - ... "B": [400, 500, 600, 700], - ... } - ... ) - >>> lf.collect() - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 400 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - >>> new_lf = pl.LazyFrame( - ... { - ... "B": [-66, None, -99], - ... "C": [5, 3, 1], - ... } - ... ) - - Update `df` values with the non-null values in `new_df`, by row index: - - >>> lf.update(new_lf).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, by row index, - but only keeping those rows that are common to both frames: - - >>> lf.update(new_lf, how="inner").collect() - shape: (3, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() - shape: (5, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴─────┘ - - Update `df` values including null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> lf.update( - ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True - ... ).collect() - shape: (5, 2) - ┌─────┬──────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ null │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴──────┘ - - ''' - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: - """ - Start a group by operation. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.group_by`. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - """ - def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. 
- period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - """ - def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.9 - This method has been renamed to :func:`LazyFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - """ - def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.group_by_dynamic`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - include_boundaries - Add the lower and upper bound of the window to the "_lower_bound" and - "_upper_bound" columns. 
This will impact performance because it\'s harder to - parallelize - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - ''' - def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: - """ - Apply a custom function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.map_batches`. - - Parameters - ---------- - function - Lambda/ function to apply. - predicate_pushdown - Allow predicate pushdown optimization to pass this node. - projection_pushdown - Allow projection pushdown optimization to pass this node. - slice_pushdown - Allow slice pushdown optimization to pass this node. - no_optimizations - Turn off all optimizations past this point. - schema - Output schema of the function, if set to `None` we assume that the schema - will remain unchanged by the applied function. - validate_output_schema - It is paramount that polars' schema is correct. This flag will ensure that - the output schema of this function will be checked with the expected schema. - Setting this to `False` will not do this check, but may lead to hard to - debug bugs. - streamable - Whether the function that is given is eligible to be running with the - streaming engine. That means that the function must produce the same result - when it is executed in batches or when it is be executed on the full - dataset. - - """ - def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def take_every(self, n: int) -> Self: - """ - Take every nth row in the LazyFrame and return as a new LazyFrame. - - .. deprecated:: 0.19.0 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - @property - def columns(self): ... - @property - def dtypes(self): ... - @property - def schema(self): ... - @property - def width(self): ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..c1557d8 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/lazyframe/frame.pyi @@ -0,0 +1,4174 @@ +#: version 0.20.1 +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use `pl.scan_csv` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use `pl.scan_parquet` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use `pl.scan_ipc` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use `pl.scan_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. 
+ + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to `True`. + If this is set to `True` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. 
+ streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. 
+ Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... 
"c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to `False`. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + This functionality is currently in an alpha state. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + DataFrame directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. 
+ + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. 
+ no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. 
+ Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def sink_ndjson(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_json("out.json") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. 
+ + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... 
) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... ).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to `True` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... 
).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `dynamic_group_by` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... 
pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... ) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... 
+ * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\', \'outer_coalesce\'} + Join strategy. + + * *inner* + Returns rows that have matching values in both tables + * *left* + Returns all rows from the left table, and the matched rows from the + right table + * *outer* + Returns all rows when there is a match in either left or right table + * *outer_coalesce* + Same as \'outer\', but coalesces the key columns + * *cross* + Returns the cartisian product of rows from both tables + * *semi* + Filter rows that have a match in the right table. + * *anti* + Filter rows that not have a match in the right table. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. 
+ + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + join_nulls + Join on null values. By default null values will never produce matches. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 5) + ┌──────┬──────┬──────┬───────┬───────────┐ + │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞══════╪══════╪══════╪═══════╪═══════════╡ + │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │ + │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │ + │ null ┆ null ┆ null ┆ z ┆ d │ + │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │ + └──────┴──────┴──────┴───────┴───────────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. 
+ + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another DataFrame: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the DataFrame. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the DataFrame. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), + polars will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.gather_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill `value` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the DataFrame to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this DataFrame. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. 
+ + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The `schema` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, `predicate_pushdown` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... 
) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the DataFrame at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... 
).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + .. warning:: + This functionality is experimental and may change without it being + considered a breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on; if given `None` the implicit row + index is used as a join key instead. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + include_nulls + If True, null values from the right DataFrame will be used to update the + left DataFrame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... 
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. 
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. 
+ * \'datapoint\': Start from the first encountered data point.
+ * a day of the week (only takes effect if `every` contains `\'w\'`):
+
+ * \'monday\': Start the window on the Monday before the first data point.
+ * \'tuesday\': Start the window on the Tuesday before the first data point.
+ * ...
+ * \'sunday\': Start the window on the Sunday before the first data point.
+ check_sorted
+ When the `by` argument is given, polars can not check sortedness
+ by the metadata and has to do a full scan on the index column to
+ verify data is sorted. This is expensive. If you are sure the
+ data within the by groups is sorted, you can set this to `False`.
+ Doing so incorrectly will lead to incorrect output.
+
+ Returns
+ -------
+ LazyGroupBy
+ Object you can call `.agg` on to aggregate by groups, the result
+ of which will be sorted by `index_column` (but note that if `by` columns are
+ passed, it will only be sorted within each `by` group).
+
+ '''
+ def map(self, function: Callable[[DataFrame], DataFrame]) -> Self:
+ """
+ Apply a custom function.
+
+ .. deprecated:: 0.19.0
+ This method has been renamed to :func:`LazyFrame.map_batches`.
+
+ Parameters
+ ----------
+ function
+ Lambda/function to apply.
+ predicate_pushdown
+ Allow predicate pushdown optimization to pass this node.
+ projection_pushdown
+ Allow projection pushdown optimization to pass this node.
+ slice_pushdown
+ Allow slice pushdown optimization to pass this node.
+ no_optimizations
+ Turn off all optimizations past this point.
+ schema
+ Output schema of the function, if set to `None` we assume that the schema
+ will remain unchanged by the applied function.
+ validate_output_schema
+ It is paramount that polars' schema is correct. This flag will ensure that
+ the output schema of this function will be checked with the expected schema.
+ Setting this to `False` will not do this check, but may lead to hard to
+ debug bugs.
+ streamable
+ Whether the function that is given is eligible to be running with the
+ streaming engine. That means that the function must produce the same result
+ when it is executed in batches or when it is executed on the full
+ dataset.
+
+ """
+ def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self:
+ """
+ Shift values by the given number of places and fill the resulting null values.
+
+ .. deprecated:: 0.19.12
+ Use :func:`shift` instead.
+
+ Parameters
+ ----------
+ fill_value
+ fill None values with the result of this expression.
+ n
+ Number of places to shift (may be negative).
+
+ """
+ def take_every(self, n: int) -> Self:
+ """
+ Take every nth row in the LazyFrame and return as a new LazyFrame.
+
+ .. deprecated:: 0.19.0
+ This method has been renamed to :meth:`gather_every`.
+
+ Parameters
+ ----------
+ n
+ Gather every *n*-th row.
+ """
+ @property
+ def columns(self): ...
+ @property
+ def dtypes(self): ...
+ @property
+ def schema(self): ...
+ @property
+ def width(self): ...
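The hunk above is the generated LazyFrame stub after the `Incomplete` clean-up pass. A minimal sketch of how such a file could be sanity-checked before committing is shown below; the helper name and the stub path are illustrative assumptions and are not part of this changeset.

# Hypothetical post-generation check (illustrative sketch only):
# confirm a cleaned stub still parses and that no `Incomplete` annotations remain.
import ast
from pathlib import Path


def stub_looks_clean(stub_path: Path) -> bool:
    """Return True if the stub parses as Python and contains no 'Incomplete' leftovers."""
    source = stub_path.read_text()
    ast.parse(source)  # raises SyntaxError if the generated stub is malformed
    return "Incomplete" not in source


# Usage (the path is a placeholder, not a file introduced by this diff):
# stub_looks_clean(Path("src/polugins_type_gen/_stubs/<version>/polars/lazyframe/frame.pyi"))
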
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/series/series deleted file mode 100644 index 4a40006..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/series/series +++ /dev/null @@ -1,4988 +0,0 @@ -import np as np -import pa as pa -import pd as pd -from builtins import PySeries -from datetime import date, datetime, timedelta -from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Object as Object, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown, Utf8 as Utf8 -from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat -from polars.exceptions import ShapeError as ShapeError -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence - -TYPE_CHECKING: bool -_PYARROW_AVAILABLE: bool - -class Series: - _s: _ClassVar[None] = ... - _accessors: _ClassVar[set] = ... 
- def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array) -> Self: - """Construct a Series from an Arrow Array.""" - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: - """Construct a Series from a pandas Series or DatetimeIndex.""" - def _get_ptr(self) -> tuple[int, int, int]: - """ - Get a pointer to the start of the values buffer of a numeric Series. - - This will raise an error if the `Series` contains multiple chunks. - - This will return the offset, length and the pointer itself. - - """ - def __bool__(self) -> NoReturn: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Series: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... - def __eq__(self, other: Any) -> Series | Expr: ... - def __ne__(self, other: Any) -> Series | Expr: ... - def __gt__(self, other: Any) -> Series | Expr: ... - def __lt__(self, other: Any) -> Series | Expr: ... - def __ge__(self, other: Any) -> Series | Expr: ... - def __le__(self, other: Any) -> Series | Expr: ... - def le(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series <= other`.""" - def lt(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series < other`.""" - def eq(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series == other`.""" - def eq_missing(self, other: Any) -> Self | Expr: - ''' - Method equivalent of equality operator `series == other` where `None == None`. - - This differs from the standard `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - See Also - -------- - ne_missing - eq - - Examples - -------- - >>> s1 = pl.Series("a", [333, 200, None]) - >>> s2 = pl.Series("a", [100, 200, None]) - >>> s1.eq(s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - null - ] - >>> s1.eq_missing(s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - ''' - def ne(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series != other`.""" - def ne_missing(self, other: Any) -> Self | Expr: - ''' - Method equivalent of equality operator `series != other` where `None == None`. - - This differs from the standard `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - See Also - -------- - eq_missing - ne - - Examples - -------- - >>> s1 = pl.Series("a", [333, 200, None]) - >>> s2 = pl.Series("a", [100, 200, None]) - >>> s1.ne(s2) - shape: (3,) - Series: \'a\' [bool] - [ - true - false - null - ] - >>> s1.ne_missing(s2) - shape: (3,) - Series: \'a\' [bool] - [ - true - false - false - ] - - ''' - def ge(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series >= other`.""" - def gt(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series > other`.""" - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... 
- def __add__(self, other: Any) -> Self | DataFrame | Expr: ... - def __sub__(self, other: Any) -> Self | Expr: ... - def __truediv__(self, other: Any) -> Series | Expr: ... - def __floordiv__(self, other: Any) -> Series | Expr: ... - def __invert__(self) -> Series: ... - def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... - def __mod__(self, other: Any) -> Series | Expr: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: - """ - Numpy __array__ interface protocol. - - Ensures that `np.asarray(pl.Series(..))` works as expected, see - https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. - """ - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: - """Numpy universal functions.""" - def __column_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. - """ - def _repr_html_(self) -> str: - """Format output data in HTML for display in Jupyter Notebooks.""" - def item(self, index: int | None = ...) -> Any: - ''' - Return the Series as a scalar, or return the element at the given index. - - If no index is provided, this is equivalent to `s[0]`, with a check - that the shape is (1,). With an index, this is equivalent to `s[index]`. - - Examples - -------- - >>> s1 = pl.Series("a", [1]) - >>> s1.item() - 1 - >>> s2 = pl.Series("a", [9, 8, 7]) - >>> s2.cum_sum().item(-1) - 24 - - ''' - def estimated_size(self, unit: SizeUnit = ...) -> int | float: - ''' - Return an estimation of the total (heap) allocated size of the Series. - - Estimated size is given in the specified unit (bytes by default). - - This estimation is the sum of the size of its buffers, validity, including - nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the - size of 2 arrays is not the sum of the sizes computed from this function. In - particular, [`StructArray`]\'s size is an upper bound. - - When an array is sliced, its allocated size remains constant because the buffer - unchanged. However, this function will yield a smaller number. This is because - this function returns the visible size of the buffer, not its total capacity. 
- - FFI buffers are included in this estimation. - - Parameters - ---------- - unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} - Scale the returned size to the given unit. - - Examples - -------- - >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) - >>> s.estimated_size() - 4000000 - >>> s.estimated_size("mb") - 3.814697265625 - - ''' - def sqrt(self) -> Series: - """ - Compute the square root of the elements. - - Syntactic sugar for - - >>> pl.Series([1, 2]) ** 0.5 - shape: (2,) - Series: '' [f64] - [ - 1.0 - 1.414214 - ] - - """ - def cbrt(self) -> Series: - """ - Compute the cube root of the elements. - - Optimization for - - >>> pl.Series([1, 2]) ** (1.0 / 3) - shape: (2,) - Series: '' [f64] - [ - 1.0 - 1.259921 - ] - - """ - def any(self) -> bool | None: - """ - Return whether any of the values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is `None`. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - bool or None - - Examples - -------- - >>> pl.Series([True, False]).any() - True - >>> pl.Series([False, False]).any() - False - >>> pl.Series([None, False]).any() - False - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None - - """ - def all(self) -> bool | None: - """ - Return whether all values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is `None`. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - bool or None - - Examples - -------- - >>> pl.Series([True, True]).all() - True - >>> pl.Series([False, True]).all() - False - >>> pl.Series([None, True]).all() - True - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None - - """ - def log(self, base: float = ...) -> Series: - """Compute the logarithm to a given base.""" - def log1p(self) -> Series: - """Compute the natural logarithm of the input array plus one, element-wise.""" - def log10(self) -> Series: - """Compute the base 10 logarithm of the input array, element-wise.""" - def exp(self) -> Series: - """Compute the exponential, element-wise.""" - def drop_nulls(self) -> Series: - ''' - Drop all null values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nans - - Notes - ----- - A null value is not the same as a NaN value. - To drop NaN values, use :func:`drop_nans`. - - Examples - -------- - >>> s = pl.Series([1.0, None, 3.0, float("nan")]) - >>> s.drop_nulls() - shape: (3,) - Series: \'\' [f64] - [ - 1.0 - 3.0 - NaN - ] - - ''' - def drop_nans(self) -> Series: - ''' - Drop all floating point NaN values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nulls - - Notes - ----- - A NaN value is not the same as a null value. - To drop null values, use :func:`drop_nulls`. 
- - Examples - -------- - >>> s = pl.Series([1.0, None, 3.0, float("nan")]) - >>> s.drop_nans() - shape: (3,) - Series: \'\' [f64] - [ - 1.0 - null - 3.0 - ] - - ''' - def to_frame(self, name: str | None = ...) -> DataFrame: - ''' - Cast this Series to a DataFrame. - - Parameters - ---------- - name - optionally name/rename the Series column in the new DataFrame. - - Examples - -------- - >>> s = pl.Series("a", [123, 456]) - >>> df = s.to_frame() - >>> df - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 123 │ - │ 456 │ - └─────┘ - - >>> df = s.to_frame("xyz") - >>> df - shape: (2, 1) - ┌─────┐ - │ xyz │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 123 │ - │ 456 │ - └─────┘ - - ''' - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: - ''' - Quick summary statistics of a Series. - - Series with mixed datatypes will return summary statistics for the datatype of - the first value. - - Parameters - ---------- - percentiles - One or more percentiles to include in the summary statistics (if the - Series has a numeric dtype). All values must be in the range `[0, 1]`. - - Notes - ----- - The median is included by default as the 50% percentile. - - Returns - ------- - DataFrame - Mapping with summary statistics of a Series. - - Examples - -------- - >>> series_num = pl.Series([1, 2, 3, 4, 5]) - >>> series_num.describe() - shape: (9, 2) - ┌────────────┬──────────┐ - │ statistic ┆ value │ - │ --- ┆ --- │ - │ str ┆ f64 │ - ╞════════════╪══════════╡ - │ count ┆ 5.0 │ - │ null_count ┆ 0.0 │ - │ mean ┆ 3.0 │ - │ std ┆ 1.581139 │ - │ min ┆ 1.0 │ - │ 25% ┆ 2.0 │ - │ 50% ┆ 3.0 │ - │ 75% ┆ 4.0 │ - │ max ┆ 5.0 │ - └────────────┴──────────┘ - - >>> series_str = pl.Series(["a", "a", None, "b", "c"]) - >>> series_str.describe() - shape: (3, 2) - ┌────────────┬───────┐ - │ statistic ┆ value │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════════╪═══════╡ - │ count ┆ 5 │ - │ null_count ┆ 1 │ - │ unique ┆ 4 │ - └────────────┴───────┘ - - ''' - def sum(self) -> int | float: - ''' - Reduce this Series to the sum value. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.sum() - 6 - - ''' - def mean(self) -> int | float | None: - ''' - Reduce this Series to the mean value. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.mean() - 2.0 - - ''' - def product(self) -> int | float: - """Reduce this Series to the product value.""" - def pow(self, exponent: int | float | None | Series) -> Series: - ''' - Raise to the power of the given exponent. - - Parameters - ---------- - exponent - The exponent. Accepts Series input. - - Examples - -------- - >>> s = pl.Series("foo", [1, 2, 3, 4]) - >>> s.pow(3) - shape: (4,) - Series: \'foo\' [f64] - [ - 1.0 - 8.0 - 27.0 - 64.0 - ] - - ''' - def min(self) -> PythonLiteral | None: - ''' - Get the minimal value in this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.min() - 1 - - ''' - def max(self) -> PythonLiteral | None: - ''' - Get the maximum value in this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.max() - 3 - - ''' - def nan_max(self) -> int | float | date | datetime | timedelta | str: - """ - Get maximum value, but propagate/poison encountered NaN values. - - This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. 
- - """ - def nan_min(self) -> int | float | date | datetime | timedelta | str: - """ - Get minimum value, but propagate/poison encountered NaN values. - - This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - """ - def std(self, ddof: int = ...) -> float | None: - ''' - Get the standard deviation of this Series. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.std() - 1.0 - - ''' - def var(self, ddof: int = ...) -> float | None: - ''' - Get variance of this Series. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.var() - 1.0 - - ''' - def median(self) -> float | None: - ''' - Get the median of this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.median() - 2.0 - - ''' - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: - ''' - Get the quantile value of this Series. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.quantile(0.5) - 2.0 - - ''' - def to_dummies(self, separator: str = ...) -> DataFrame: - ''' - Get dummy/indicator variables. - - Parameters - ---------- - separator - Separator/delimiter used when generating column names. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.to_dummies() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a_1 ┆ a_2 ┆ a_3 │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 0 ┆ 0 │ - │ 0 ┆ 1 ┆ 0 │ - │ 0 ┆ 0 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def cut(self, breaks: Sequence[float]) -> Series | DataFrame: - ''' - Bin continuous values into discrete categories. - - Parameters - ---------- - breaks - List of unique cut points. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - break_point_label - Name of the breakpoint column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - category_label - Name of the category column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - left_closed - Set the intervals to be left-closed instead of right-closed. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - as_series - If set to `False`, return a DataFrame containing the original values, - the breakpoints, and the categories. - - .. deprecated:: 0.19.0 - This parameter will be removed. The same behavior can be achieved by - setting `include_breaks=True`, unnesting the resulting struct Series, - and adding the result to the original Series. 
- - Returns - ------- - Series - Series of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise a Series of data type :class:`Struct`. - - See Also - -------- - qcut - - Examples - -------- - Divide the column into three categories. - - >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) - >>> s.cut([-1, 1], labels=["a", "b", "c"]) - shape: (5,) - Series: \'foo\' [cat] - [ - "a" - "a" - "b" - "b" - "c" - ] - - Create a DataFrame with the breakpoint and category for each value. - - >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") - >>> s.to_frame().with_columns(cut).unnest("cut") - shape: (5, 3) - ┌─────┬─────────────┬────────────┐ - │ foo ┆ break_point ┆ category │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪═════════════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴─────────────┴────────────┘ - - ''' - def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: - ''' - Bin continuous values into discrete categories based on their quantiles. - - Parameters - ---------- - quantiles - Either a list of quantile probabilities between 0 and 1 or a positive - integer determining the number of bins with uniform probability. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - left_closed - Set the intervals to be left-closed instead of right-closed. - allow_duplicates - If set to `True`, duplicates in the resulting quantiles are dropped, - rather than raising a `DuplicateError`. This can happen even with unique - probabilities, depending on the data. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - break_point_label - Name of the breakpoint column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - category_label - Name of the category column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - as_series - If set to `False`, return a DataFrame containing the original values, - the breakpoints, and the categories. - - .. deprecated:: 0.19.0 - This parameter will be removed. The same behavior can be achieved by - setting `include_breaks=True`, unnesting the resulting struct Series, - and adding the result to the original Series. - - Returns - ------- - Series - Series of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise a Series of data type :class:`Struct`. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - See Also - -------- - cut - - Examples - -------- - Divide a column into three categories according to pre-defined quantile - probabilities. - - >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) - >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) - shape: (5,) - Series: \'foo\' [cat] - [ - "a" - "a" - "b" - "b" - "c" - ] - - Divide a column into two categories using uniform quantile probabilities. 
- - >>> s.qcut(2, labels=["low", "high"], left_closed=True) - shape: (5,) - Series: \'foo\' [cat] - [ - "low" - "low" - "high" - "high" - "high" - ] - - Create a DataFrame with the breakpoint and category for each value. - - >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") - >>> s.to_frame().with_columns(cut).unnest("cut") - shape: (5, 3) - ┌─────┬─────────────┬────────────┐ - │ foo ┆ break_point ┆ category │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪═════════════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴─────────────┴────────────┘ - - ''' - def rle(self) -> Series: - ''' - Get the lengths of runs of identical values. - - Returns - ------- - Series - Series of data type :class:`Struct` with Fields "lengths" and "values". - - Examples - -------- - >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) - >>> s.rle().struct.unnest() - shape: (6, 2) - ┌─────────┬────────┐ - │ lengths ┆ values │ - │ --- ┆ --- │ - │ i32 ┆ i64 │ - ╞═════════╪════════╡ - │ 2 ┆ 1 │ - │ 1 ┆ 2 │ - │ 1 ┆ 1 │ - │ 1 ┆ null │ - │ 1 ┆ 1 │ - │ 2 ┆ 3 │ - └─────────┴────────┘ - ''' - def rle_id(self) -> Series: - ''' - Map values to run IDs. - - Similar to RLE, but it maps each value to an ID corresponding to the run into - which it falls. This is especially useful when you want to define groups by - runs of identical values rather than the values themselves. - - Returns - ------- - Series - - See Also - -------- - rle - - Examples - -------- - >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) - >>> s.rle_id() - shape: (8,) - Series: \'s\' [u32] - [ - 0 - 0 - 1 - 2 - 3 - 4 - 5 - 5 - ] - ''' - def hist(self, bins: list[float] | None = ...) -> DataFrame: - ''' - Bin values into buckets and count their occurrences. - - Parameters - ---------- - bins - Discretizations to make. - If None given, we determine the boundaries based on the data. - bin_count - If no bins provided, this will be used to determine - the distance of the bins - - Returns - ------- - DataFrame - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Examples - -------- - >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) - >>> a.hist(bin_count=4) - shape: (5, 3) - ┌─────────────┬─────────────┬─────────┐ - │ break_point ┆ category ┆ a_count │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ cat ┆ u32 │ - ╞═════════════╪═════════════╪═════════╡ - │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ - │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ - │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ - │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ - │ inf ┆ (6.75, inf] ┆ 2 │ - └─────────────┴─────────────┴─────────┘ - - ''' - def value_counts(self) -> DataFrame: - ''' - Count the occurrences of unique values. - - Parameters - ---------- - sort - Sort the output by count in descending order. - If set to `False` (default), the order of the output is random. - parallel - Execute the computation in parallel. - - .. note:: - This option should likely not be enabled in a group by context, - as the computation is already parallelized per group. - - Returns - ------- - DataFrame - Mapping of unique values to their count. - - Examples - -------- - >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) - >>> s.value_counts() # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌───────┬────────┐ - │ color ┆ counts │ - │ --- ┆ --- │ - │ str ┆ u32 │ - ╞═══════╪════════╡ - │ red ┆ 2 │ - │ green ┆ 1 │ - │ blue ┆ 3 │ - └───────┴────────┘ - - Sort the output by count. 
- - shape: (3, 2) - ┌───────┬────────┐ - │ color ┆ counts │ - │ --- ┆ --- │ - │ str ┆ u32 │ - ╞═══════╪════════╡ - │ blue ┆ 3 │ - │ red ┆ 2 │ - │ green ┆ 1 │ - └───────┴────────┘ - - ''' - def unique_counts(self) -> Series: - ''' - Return a count of the unique values in the order of appearance. - - Examples - -------- - >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) - >>> s.unique_counts() - shape: (3,) - Series: \'id\' [u32] - [ - 1 - 2 - 3 - ] - - ''' - def entropy(self, base: float = ...) -> float | None: - """ - Computes the entropy. - - Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. - - Parameters - ---------- - base - Given base, defaults to `e` - normalize - Normalize pk if it doesn't sum to 1. - - Examples - -------- - >>> a = pl.Series([0.99, 0.005, 0.005]) - >>> a.entropy(normalize=True) - 0.06293300616044681 - >>> b = pl.Series([0.65, 0.10, 0.25]) - >>> b.entropy(normalize=True) - 0.8568409950394724 - - """ - def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: - ''' - Run an expression over a sliding window that increases `1` slot every iteration. - - Parameters - ---------- - expr - Expression to evaluate - min_periods - Number of valid values there should be in the window before the expression - is evaluated. valid values = `length - null_count` - parallel - Run in parallel. Don\'t do this in a group by or another operation that - already has much parallelization. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - This can be really slow as it can have `O(n^2)` complexity. Don\'t use this - for operations that visit all elements. - - Examples - -------- - >>> s = pl.Series("values", [1, 2, 3, 4, 5]) - >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) - shape: (5,) - Series: \'values\' [f64] - [ - 0.0 - -3.0 - -8.0 - -15.0 - -24.0 - ] - - ''' - def alias(self, name: str) -> Series: - ''' - Rename the series. - - Parameters - ---------- - name - The new name. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.alias("b") - shape: (3,) - Series: \'b\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def rename(self, name: str) -> Series: - ''' - Rename this Series. - - Alias for :func:`Series.alias`. - - Parameters - ---------- - name - New name. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.rename("b") - shape: (3,) - Series: \'b\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def chunk_lengths(self) -> list[int]: - ''' - Get the length of each individual chunk. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("a", [4, 5, 6]) - - Concatenate Series with rechunk = True - - >>> pl.concat([s, s2]).chunk_lengths() - [6] - - Concatenate Series with rechunk = False - - >>> pl.concat([s, s2], rechunk=False).chunk_lengths() - [3, 3] - - ''' - def n_chunks(self) -> int: - ''' - Get the number of chunks that this Series contains. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.n_chunks() - 1 - >>> s2 = pl.Series("a", [4, 5, 6]) - - Concatenate Series with rechunk = True - - >>> pl.concat([s, s2]).n_chunks() - 1 - - Concatenate Series with rechunk = False - - >>> pl.concat([s, s2], rechunk=False).n_chunks() - 2 - - ''' - def cum_max(self) -> Series: - ''' - Get an array with the cumulative max computed at every element. - - Parameters - ---------- - reverse - reverse the operation. 
- - Examples - -------- - >>> s = pl.Series("s", [3, 5, 1]) - >>> s.cum_max() - shape: (3,) - Series: \'s\' [i64] - [ - 3 - 5 - 5 - ] - - ''' - def cum_min(self) -> Series: - ''' - Get an array with the cumulative min computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Examples - -------- - >>> s = pl.Series("s", [1, 2, 3]) - >>> s.cum_min() - shape: (3,) - Series: \'s\' [i64] - [ - 1 - 1 - 1 - ] - - ''' - def cum_prod(self) -> Series: - ''' - Get an array with the cumulative product computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.cum_prod() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 6 - ] - - ''' - def cum_sum(self) -> Series: - ''' - Get an array with the cumulative sum computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.cum_sum() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 3 - 6 - ] - - ''' - def slice(self, offset: int, length: int | None = ...) -> Series: - ''' - Get a slice of this Series. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.slice(1, 2) - shape: (2,) - Series: \'a\' [i64] - [ - 2 - 3 - ] - - ''' - def append(self, other: Series) -> Self: - ''' - Append a Series to this one. - - Parameters - ---------- - other - Series to append. - append_chunks - .. deprecated:: 0.18.8 - This argument will be removed and `append` will change to always - behave like `append_chunks=True` (the previous default). For the - behavior of `append_chunks=False`, use `Series.extend`. - - If set to `True` the append operation will add the chunks from `other` to - self. This is super cheap. - - If set to `False` the append operation will do the same as - `DataFrame.extend` which extends the memory backed by this `Series` with - the values from `other`. - - Different from `append chunks`, `extend` appends the data from `other` to - the underlying memory locations and thus may cause a reallocation (which are - expensive). - - If this does not cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `append_chunks` when you want to do a query after a - single append. For instance during online operations where you add `n` rows - and rerun a query. - - Prefer `append_chunks` over `extend` when you want to append many times - before doing a query. For instance when you read in multiple files and when - to store them in a single `Series`. In the latter case, finish the sequence - of `append_chunks` operations with a `rechunk`. - - Warnings - -------- - This method modifies the series in-place. The series is returned for - convenience only. - - See Also - -------- - extend - - Examples - -------- - >>> a = pl.Series("a", [1, 2, 3]) - >>> b = pl.Series("b", [4, 5]) - >>> a.append(b) - shape: (5,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ] - - The resulting series will consist of multiple chunks. 
- - >>> a.n_chunks() - 2 - - ''' - def extend(self, other: Series) -> Self: - ''' - Extend the memory backed by this Series with the values from another. - - Different from `append`, which adds the chunks from `other` to the chunks of - this series, `extend` appends the data from `other` to the underlying memory - locations and thus may cause a reallocation (which is expensive). - - If this does `not` cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `append` when you want to do a query after a single - append. For instance, during online operations where you add `n` rows - and rerun a query. - - Prefer `append` over `extend` when you want to append many times - before doing a query. For instance, when you read in multiple files and want - to store them in a single `Series`. In the latter case, finish the sequence - of `append` operations with a `rechunk`. - - Parameters - ---------- - other - Series to extend the series with. - - Warnings - -------- - This method modifies the series in-place. The series is returned for - convenience only. - - See Also - -------- - append - - Examples - -------- - >>> a = pl.Series("a", [1, 2, 3]) - >>> b = pl.Series("b", [4, 5]) - >>> a.extend(b) - shape: (5,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ] - - The resulting series will consist of a single chunk. - - >>> a.n_chunks() - 1 - - ''' - def filter(self, predicate: Series | list[bool]) -> Self: - ''' - Filter elements by a boolean mask. - - The original order of the remaining elements is preserved. - - Parameters - ---------- - predicate - Boolean mask. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> mask = pl.Series("", [True, False, True]) - >>> s.filter(mask) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 3 - ] - - ''' - def head(self, n: int = ...) -> Series: - ''' - Get the first `n` elements. - - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the last `abs(n)`. - - See Also - -------- - tail, slice - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.head(3) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - Pass a negative value to get all rows `except` the last `abs(n)`. - - >>> s.head(-3) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 2 - ] - - ''' - def tail(self, n: int = ...) -> Series: - ''' - Get the last `n` elements. - - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the first `abs(n)`. - - See Also - -------- - head, slice - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.tail(3) - shape: (3,) - Series: \'a\' [i64] - [ - 3 - 4 - 5 - ] - - Pass a negative value to get all rows `except` the first `abs(n)`. - - >>> s.tail(-3) - shape: (2,) - Series: \'a\' [i64] - [ - 4 - 5 - ] - - ''' - def limit(self, n: int = ...) -> Series: - """ - Get the first `n` elements. - - Alias for :func:`Series.head`. - - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the last `abs(n)`. - - See Also - -------- - head - - """ - def gather_every(self, n: int) -> Series: - ''' - Take every nth value in the Series and return as new Series. - - Parameters - ---------- - n - Gather every *n*-th row. 
- - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.gather_every(2) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 3 - ] - - ''' - def sort(self) -> Self: - ''' - Sort this Series. - - Parameters - ---------- - descending - Sort in descending order. - in_place - Sort in-place. - - Examples - -------- - >>> s = pl.Series("a", [1, 3, 4, 2]) - >>> s.sort() - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - ] - >>> s.sort(descending=True) - shape: (4,) - Series: \'a\' [i64] - [ - 4 - 3 - 2 - 1 - ] - - ''' - def top_k(self, k: int | IntoExprColumn = ...) -> Series: - ''' - Return the `k` largest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - bottom_k - - Examples - -------- - >>> s = pl.Series("a", [2, 5, 1, 4, 3]) - >>> s.top_k(3) - shape: (3,) - Series: \'a\' [i64] - [ - 5 - 4 - 3 - ] - - ''' - def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: - ''' - Return the `k` smallest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - top_k - - Examples - -------- - >>> s = pl.Series("a", [2, 5, 1, 4, 3]) - >>> s.bottom_k(3) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def arg_sort(self) -> Series: - ''' - Get the index values that would sort this Series. - - Parameters - ---------- - descending - Sort in descending order. - nulls_last - Place null values last instead of first. - - Examples - -------- - >>> s = pl.Series("a", [5, 3, 4, 1, 2]) - >>> s.arg_sort() - shape: (5,) - Series: \'a\' [u32] - [ - 3 - 4 - 1 - 2 - 0 - ] - - ''' - def arg_unique(self) -> Series: - ''' - Get unique index as Series. - - Returns - ------- - Series - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.arg_unique() - shape: (3,) - Series: \'a\' [u32] - [ - 0 - 1 - 3 - ] - - ''' - def arg_min(self) -> int | None: - ''' - Get the index of the minimal value. - - Returns - ------- - int - - Examples - -------- - >>> s = pl.Series("a", [3, 2, 1]) - >>> s.arg_min() - 2 - - ''' - def arg_max(self) -> int | None: - ''' - Get the index of the maximal value. - - Returns - ------- - int - - Examples - -------- - >>> s = pl.Series("a", [3, 2, 1]) - >>> s.arg_max() - 0 - - ''' - def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: - """ - Find indices where elements should be inserted to maintain order. - - .. math:: a[i-1] < v <= a[i] - - Parameters - ---------- - element - Expression or scalar value. - side : {'any', 'left', 'right'} - If 'any', the index of the first suitable location found is given. - If 'left', the index of the leftmost suitable location found is given. - If 'right', return the rightmost suitable location found is given. - - """ - def unique(self) -> Series: - ''' - Get unique elements in series. - - Parameters - ---------- - maintain_order - Maintain order of data. This requires more work. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.unique().sort() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: - ''' - Take values by index. - - Parameters - ---------- - indices - Index location used for selection. 
- - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.gather([1, 3]) - shape: (2,) - Series: \'a\' [i64] - [ - 2 - 4 - ] - - ''' - def null_count(self) -> int: - """Count the null values in this Series.""" - def has_validity(self) -> bool: - """ - Return True if the Series has a validity bitmask. - - If there is no mask, it means that there are no `null` values. - - Notes - ----- - While the *absence* of a validity bitmask guarantees that a Series does not - have `null` values, the converse is not true, eg: the *presence* of a - bitmask does not mean that there are null values, as every value of the - bitmask could be `false`. - - To confirm that a column has `null` values use :func:`null_count`. - - """ - def is_empty(self) -> bool: - ''' - Check if the Series is empty. - - Examples - -------- - >>> s = pl.Series("a", [], dtype=pl.Float32) - >>> s.is_empty() - True - - ''' - def is_sorted(self) -> bool: - """ - Check if the Series is sorted. - - Parameters - ---------- - descending - Check if the Series is sorted in descending order - - """ - def not_(self) -> Series: - ''' - Negate a boolean Series. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [True, False, False]) - >>> s.not_() - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - ''' - def is_null(self) -> Series: - ''' - Returns a boolean Series indicating which values are null. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) - >>> s.is_null() - shape: (4,) - Series: \'a\' [bool] - [ - false - false - false - true - ] - - ''' - def is_not_null(self) -> Series: - ''' - Returns a boolean Series indicating which values are not null. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) - >>> s.is_not_null() - shape: (4,) - Series: \'a\' [bool] - [ - true - true - true - false - ] - - ''' - def is_finite(self) -> Series: - ''' - Returns a boolean Series indicating which values are finite. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, np.inf]) - >>> s.is_finite() - shape: (3,) - Series: \'a\' [bool] - [ - true - true - false - ] - - ''' - def is_infinite(self) -> Series: - ''' - Returns a boolean Series indicating which values are infinite. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, np.inf]) - >>> s.is_infinite() - shape: (3,) - Series: \'a\' [bool] - [ - false - false - true - ] - - ''' - def is_nan(self) -> Series: - ''' - Returns a boolean Series indicating which values are not NaN. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) - >>> s.is_nan() - shape: (4,) - Series: \'a\' [bool] - [ - false - false - false - true - ] - - ''' - def is_not_nan(self) -> Series: - ''' - Returns a boolean Series indicating which values are not NaN. - - Returns - ------- - Series - Series of data type :class:`Boolean`. 
- - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) - >>> s.is_not_nan() - shape: (4,) - Series: \'a\' [bool] - [ - true - true - true - false - ] - - ''' - def is_in(self, other: Series | Collection[Any]) -> Series: - ''' - Check if elements of this Series are in the other Series. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [2, 4]) - >>> s2.is_in(s) - shape: (2,) - Series: \'b\' [bool] - [ - true - false - ] - - >>> # check if some values are a member of sublists - >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) - >>> optional_members = pl.Series("optional_members", [1, 2, 3]) - >>> print(sets) - shape: (3,) - Series: \'sets\' [list[i64]] - [ - [1, 2, 3] - [1, 2] - [9, 10] - ] - >>> print(optional_members) - shape: (3,) - Series: \'optional_members\' [i64] - [ - 1 - 2 - 3 - ] - >>> optional_members.is_in(sets) - shape: (3,) - Series: \'optional_members\' [bool] - [ - true - true - false - ] - - ''' - def arg_true(self) -> Series: - ''' - Get index values where Boolean Series evaluate True. - - Returns - ------- - Series - Series of data type :class:`UInt32`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> (s == 2).arg_true() - shape: (1,) - Series: \'a\' [u32] - [ - 1 - ] - - ''' - def is_unique(self) -> Series: - ''' - Get mask of all unique values. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.is_unique() - shape: (4,) - Series: \'a\' [bool] - [ - true - false - false - true - ] - - ''' - def is_first_distinct(self) -> Series: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series([1, 1, 2, 3, 2]) - >>> s.is_first_distinct() - shape: (5,) - Series: '' [bool] - [ - true - false - true - true - false - ] - - """ - def is_last_distinct(self) -> Series: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series([1, 1, 2, 3, 2]) - >>> s.is_last_distinct() - shape: (5,) - Series: '' [bool] - [ - false - true - false - true - true - ] - - """ - def is_duplicated(self) -> Series: - ''' - Get mask of all duplicated values. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.is_duplicated() - shape: (4,) - Series: \'a\' [bool] - [ - false - true - true - false - ] - - ''' - def explode(self) -> Series: - """ - Explode a list Series. - - This means that every item is expanded to a new row. - - Returns - ------- - Series - Series with the data type of the list elements. - - See Also - -------- - Series.list.explode : Explode a list column. - Series.str.explode : Explode a string column. - - """ - def equals(self, other: Series) -> bool: - ''' - Check whether the Series is equal to another Series. - - Parameters - ---------- - other - Series to compare with. - null_equal - Consider null values as equal. - strict - Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a - `pl.Int64` will return `False`. 
- - See Also - -------- - assert_series_equal - - Examples - -------- - >>> s1 = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [4, 5, 6]) - >>> s1.equals(s1) - True - >>> s1.equals(s2) - False - ''' - def len(self) -> int: - ''' - Return the number of elements in this Series. - - Null values are treated like regular elements in this context. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None]) - >>> s.len() - 3 - - ''' - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: - ''' - Cast between data types. - - Parameters - ---------- - dtype - DataType to cast to. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> s = pl.Series("a", [True, False, True]) - >>> s - shape: (3,) - Series: \'a\' [bool] - [ - true - false - true - ] - - >>> s.cast(pl.UInt32) - shape: (3,) - Series: \'a\' [u32] - [ - 1 - 0 - 1 - ] - - ''' - def to_physical(self) -> Series: - ''' - Cast to physical representation of the logical dtype. - - - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` - - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` - - `List(inner)` -> `List(physical of inner)` - - Other data types will be left unchanged. - - Examples - -------- - Replicating the pandas - `pd.Series.factorize - `_ - method. - - >>> s = pl.Series("values", ["a", None, "x", "a"]) - >>> s.cast(pl.Categorical).to_physical() - shape: (4,) - Series: \'values\' [u32] - [ - 0 - null - 1 - 0 - ] - - ''' - def to_list(self) -> list[Any]: - ''' - Convert this Series to a Python List. This operation clones data. - - Parameters - ---------- - use_pyarrow - Use pyarrow for the conversion. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.to_list() - [1, 2, 3] - >>> type(s.to_list()) - - - ''' - def rechunk(self) -> Self: - """ - Create a single chunk of memory for this Series. - - Parameters - ---------- - in_place - In place or not. - - """ - def reverse(self) -> Series: - ''' - Return Series in reverse order. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) - >>> s.reverse() - shape: (3,) - Series: \'a\' [i8] - [ - 3 - 2 - 1 - ] - - ''' - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: - ''' - Get a boolean mask of the values that fall between the given start/end values. - - Parameters - ---------- - lower_bound - Lower bound value. Accepts expression input. Non-expression inputs - (including strings) are parsed as literals. - upper_bound - Upper bound value. Accepts expression input. Non-expression inputs - (including strings) are parsed as literals. - closed : {\'both\', \'left\', \'right\', \'none\'} - Define which sides of the interval are closed (inclusive). 
- - Examples - -------- - >>> s = pl.Series("num", [1, 2, 3, 4, 5]) - >>> s.is_between(2, 4) - shape: (5,) - Series: \'num\' [bool] - [ - false - true - true - true - false - ] - - Use the `closed` argument to include or exclude the values at the bounds: - - >>> s.is_between(2, 4, closed="left") - shape: (5,) - Series: \'num\' [bool] - [ - false - true - true - false - false - ] - - You can also use strings as well as numeric/temporal values: - - >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) - >>> s.is_between("b", "d", closed="both") - shape: (5,) - Series: \'s\' [bool] - [ - false - true - true - true - false - ] - - ''' - def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: - ''' - Convert this Series to numpy. - - This operation may clone data but is completely safe. Note that: - - - data which is purely numeric AND without null values is not cloned; - - floating point `nan` values can be zero-copied; - - booleans can\'t be zero-copied. - - To ensure that no data is cloned, set `zero_copy_only=True`. - - Parameters - ---------- - *args - args will be sent to pyarrow.Array.to_numpy. - zero_copy_only - If True, an exception will be raised if the conversion to a numpy - array would require copying the underlying data (e.g. in presence - of nulls, or for non-primitive types). - writable - For numpy arrays created with zero copy (view on the Arrow data), - the resulting array is not writable (Arrow data is immutable). - By setting this to True, a copy of the array is made to ensure - it is writable. - use_pyarrow - Use `pyarrow.Array.to_numpy - `_ - - for the conversion to numpy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> arr = s.to_numpy() - >>> arr # doctest: +IGNORE_RESULT - array([1, 2, 3], dtype=int64) - >>> type(arr) - - - ''' - def _view(self) -> SeriesView: - ''' - Get a view into this Series data with a numpy array. - - This operation doesn\'t clone data, but does not include missing values. - - Returns - ------- - SeriesView - - Parameters - ---------- - ignore_nulls - If True then nulls are converted to 0. - If False then an Exception is raised if nulls are present. - - Examples - -------- - >>> s = pl.Series("a", [1, None]) - >>> s._view(ignore_nulls=True) - SeriesView([1, 0]) - - ''' - def to_arrow(self) -> pa.Array: - ''' - Get the underlying Arrow Array. - - If the Series contains only a single chunk this operation is zero copy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s = s.to_arrow() - >>> s # doctest: +ELLIPSIS - - [ - 1, - 2, - 3 - ] - - ''' - def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: - ''' - Convert this Series to a pandas Series. - - This requires that :mod:`pandas` and :mod:`pyarrow` are installed. - This operation clones data, unless `use_pyarrow_extension_array=True`. - - Parameters - ---------- - use_pyarrow_extension_array - Further operations on this Pandas series, might trigger conversion to numpy. - Use PyArrow backed-extension array instead of numpy array for pandas - Series. This allows zero copy operations and preservation of nulls - values. - Further operations on this pandas Series, might trigger conversion - to NumPy arrays if that operation is not supported by pyarrow compute - functions. - kwargs - Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
- - Examples - -------- - >>> s1 = pl.Series("a", [1, 2, 3]) - >>> s1.to_pandas() - 0 1 - 1 2 - 2 3 - Name: a, dtype: int64 - >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP - 0 1 - 1 2 - 2 3 - Name: a, dtype: int64[pyarrow] - >>> s2 = pl.Series("b", [1, 2, None, 4]) - >>> s2.to_pandas() - 0 1.0 - 1 2.0 - 2 NaN - 3 4.0 - Name: b, dtype: float64 - >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP - 0 1 - 1 2 - 2 - 3 4 - Name: b, dtype: int64[pyarrow] - - ''' - def to_init_repr(self, n: int = ...) -> str: - ''' - Convert Series to instantiatable string representation. - - Parameters - ---------- - n - Only use first n elements. - - See Also - -------- - polars.Series.to_init_repr - polars.from_repr - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) - >>> print(s.to_init_repr()) - pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) - >>> s_from_str_repr = eval(s.to_init_repr()) - >>> s_from_str_repr - shape: (4,) - Series: \'a\' [i16] - [ - 1 - 2 - null - 4 - ] - - ''' - def set(self, filter: Series, value: int | float | str | bool | None) -> Series: - ''' - Set masked values. - - Parameters - ---------- - filter - Boolean mask. - value - Value with which to replace the masked values. - - Notes - ----- - Use of this function is frequently an anti-pattern, as it can - block optimisation (predicate pushdown, etc). Consider using - `pl.when(predicate).then(value).otherwise(self)` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.set(s == 2, 10) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 10 - 3 - ] - - It is better to implement this as follows: - - >>> s.to_frame().select( - ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) - ... ) - shape: (3, 1) - ┌─────────┐ - │ literal │ - │ --- │ - │ i64 │ - ╞═════════╡ - │ 1 │ - │ 10 │ - │ 3 │ - └─────────┘ - - ''' - def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: - ''' - Set values at the index locations. - - Parameters - ---------- - indices - Integers representing the index locations. - values - Replacement values. - - Notes - ----- - Use of this function is frequently an anti-pattern, as it can - block optimization (predicate pushdown, etc). Consider using - `pl.when(predicate).then(value).otherwise(self)` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.scatter(1, 10) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 10 - 3 - ] - - It is better to implement this as follows: - - >>> s.to_frame().with_row_count("row_nr").select( - ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) - ... ) - shape: (3, 1) - ┌─────────┐ - │ literal │ - │ --- │ - │ i64 │ - ╞═════════╡ - │ 1 │ - │ 10 │ - │ 3 │ - └─────────┘ - - ''' - def clear(self, n: int = ...) -> Series: - ''' - Create an empty copy of the current Series, with zero to \'n\' elements. - - The copy has an identical name/dtype, but no data. - - Parameters - ---------- - n - Number of (empty) elements to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. 
- - Examples - -------- - >>> s = pl.Series("a", [None, True, False]) - >>> s.clear() - shape: (0,) - Series: \'a\' [bool] - [ - ] - - >>> s.clear(n=2) - shape: (2,) - Series: \'a\' [bool] - [ - null - null - ] - - ''' - def clone(self) -> Self: - ''' - Create a copy of this Series. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current Series, with identical - schema but no data. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.clone() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Series: - ''' - Fill floating point NaN value with a fill value. - - Parameters - ---------- - value - Value used to fill NaN values. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, float("nan")]) - >>> s.fill_nan(0) - shape: (4,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - 0.0 - ] - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, None]) - >>> s.fill_null(strategy="forward") - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 3 - ] - >>> s.fill_null(strategy="min") - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 1 - ] - >>> s = pl.Series("b", ["x", None, "z"]) - >>> s.fill_null(pl.lit("")) - shape: (3,) - Series: \'b\' [str] - [ - "x" - "" - "z" - ] - - ''' - def floor(self) -> Series: - ''' - Rounds down to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.floor() - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - ] - - ''' - def ceil(self) -> Series: - ''' - Rounds up to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.ceil() - shape: (3,) - Series: \'a\' [f64] - [ - 2.0 - 3.0 - 4.0 - ] - - ''' - def round(self, decimals: int = ...) -> Series: - ''' - Round underlying floating point data by `decimals` digits. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.round(2) - shape: (3,) - Series: \'a\' [f64] - [ - 1.12 - 2.57 - 3.9 - ] - - Parameters - ---------- - decimals - number of decimals to round by. - - ''' - def round_sig_figs(self, digits: int) -> Series: - """ - Round to a number of significant figures. - - Parameters - ---------- - digits - Number of significant figures to round to. - - Examples - -------- - >>> s = pl.Series([0.01234, 3.333, 1234.0]) - >>> s.round_sig_figs(2) - shape: (3,) - Series: '' [f64] - [ - 0.012 - 3.3 - 1200.0 - ] - - """ - def dot(self, other: Series | ArrayLike) -> float | None: - ''' - Compute the dot/inner product between two Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) - >>> s.dot(s2) - 32.0 - - Parameters - ---------- - other - Series (or array) to compute dot product with. - - ''' - def mode(self) -> Series: - ''' - Compute the most occurring value(s). 
- - Can return multiple Values. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.mode() - shape: (1,) - Series: \'a\' [i64] - [ - 2 - ] - - ''' - def sign(self) -> Series: - ''' - Compute the element-wise indication of the sign. - - The returned values can be -1, 0, or 1: - - * -1 if x < 0. - * 0 if x == 0. - * 1 if x > 0. - - (null values are preserved as-is). - - Examples - -------- - >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) - >>> s.sign() - shape: (5,) - Series: \'a\' [i64] - [ - -1 - 0 - 0 - 1 - null - ] - - ''' - def sin(self) -> Series: - ''' - Compute the element-wise value for the sine. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.sin() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.0 - 1.2246e-16 - ] - - ''' - def cos(self) -> Series: - ''' - Compute the element-wise value for the cosine. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.cos() - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 6.1232e-17 - -1.0 - ] - - ''' - def tan(self) -> Series: - ''' - Compute the element-wise value for the tangent. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.tan() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.6331e16 - -1.2246e-16 - ] - - ''' - def cot(self) -> Series: - ''' - Compute the element-wise value for the cotangent. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.cot() - shape: (3,) - Series: \'a\' [f64] - [ - inf - 6.1232e-17 - -8.1656e15 - ] - - ''' - def arcsin(self) -> Series: - ''' - Compute the element-wise value for the inverse sine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arcsin() - shape: (3,) - Series: \'a\' [f64] - [ - 1.570796 - 0.0 - -1.570796 - ] - - ''' - def arccos(self) -> Series: - ''' - Compute the element-wise value for the inverse cosine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arccos() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.570796 - 3.141593 - ] - - ''' - def arctan(self) -> Series: - ''' - Compute the element-wise value for the inverse tangent. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arctan() - shape: (3,) - Series: \'a\' [f64] - [ - 0.785398 - 0.0 - -0.785398 - ] - - ''' - def arcsinh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic sine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arcsinh() - shape: (3,) - Series: \'a\' [f64] - [ - 0.881374 - 0.0 - -0.881374 - ] - - ''' - def arccosh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic cosine. - - Examples - -------- - >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) - >>> s.arccosh() - shape: (4,) - Series: \'a\' [f64] - [ - 2.292432 - 0.0 - NaN - NaN - ] - - ''' - def arctanh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic tangent. - - Examples - -------- - >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) - >>> s.arctanh() - shape: (7,) - Series: \'a\' [f64] - [ - NaN - inf - 0.549306 - 0.0 - -0.549306 - -inf - NaN - ] - - ''' - def sinh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic sine. 
- - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.sinh() - shape: (3,) - Series: \'a\' [f64] - [ - 1.175201 - 0.0 - -1.175201 - ] - - ''' - def cosh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic cosine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.cosh() - shape: (3,) - Series: \'a\' [f64] - [ - 1.543081 - 1.0 - 1.543081 - ] - - ''' - def tanh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic tangent. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.tanh() - shape: (3,) - Series: \'a\' [f64] - [ - 0.761594 - 0.0 - -0.761594 - ] - - ''' - def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Map a custom/user-defined function (UDF) over elements in this Series. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - If the function returns a different datatype, the return_dtype arg should - be set, otherwise the method will fail. - - Implementing logic using a Python function is almost always *significantly* - slower and more memory intensive than implementing the same logic using - the native expression API because: - - - The native expression engine runs in Rust; UDFs run in Python. - - Use of Python UDFs forces the DataFrame to be materialized in memory. - - Polars-native expressions can be parallelised (UDFs typically cannot). - - Polars-native expressions can be logically optimised (UDFs cannot). - - Wherever possible you should strongly prefer the native expression API - to achieve the best performance. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output datatype. If none is given, the same datatype as this Series will be - used. - skip_nulls - Nulls will be skipped and not passed to the python function. - This is faster because python can be skipped and because we call - more specialized functions. - - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - Notes - ----- - If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP - shape: (3,) - Series: \'a\' [i64] - [ - 11 - 12 - 13 - ] - - Returns - ------- - Series - - ''' - def shift(self, n: int = ...) -> Series: - """ - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> s = pl.Series([1, 2, 3, 4]) - >>> s.shift() - shape: (4,) - Series: '' [i64] - [ - null - 1 - 2 - 3 - ] - - Pass a negative value to shift in the opposite direction instead. 
- - >>> s.shift(-2) - shape: (4,) - Series: '' [i64] - [ - 3 - 4 - null - null - ] - - Specify `fill_value` to fill the resulting null values. - - >>> s.shift(-2, fill_value=100) - shape: (4,) - Series: '' [i64] - [ - 3 - 4 - 100 - 100 - ] - - """ - def zip_with(self, mask: Series, other: Series) -> Self: - """ - Take values from self or other based on the given mask. - - Where mask evaluates true, take values from self. Where mask evaluates false, - take values from other. - - Parameters - ---------- - mask - Boolean Series. - other - Series of same type. - - Returns - ------- - Series - - Examples - -------- - >>> s1 = pl.Series([1, 2, 3, 4, 5]) - >>> s2 = pl.Series([5, 4, 3, 2, 1]) - >>> s1.zip_with(s1 < s2, s2) - shape: (5,) - Series: '' [i64] - [ - 1 - 2 - 3 - 2 - 1 - ] - >>> mask = pl.Series([True, False, True, False, True]) - >>> s1.zip_with(mask, s2) - shape: (5,) - Series: '' [i64] - [ - 1 - 4 - 3 - 2 - 5 - ] - - """ - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling min (moving min) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their min. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_min(window_size=3) - shape: (5,) - Series: \'a\' [i64] - [ - null - null - 100 - 200 - 300 - ] - - ''' - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling max (moving max) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their max. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_max(window_size=2) - shape: (5,) - Series: \'a\' [i64] - [ - null - 200 - 300 - 400 - 500 - ] - - ''' - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling mean (moving mean) over the values in this array. - - A window of length `window_size` will traverse the array. 
The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their mean. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_mean(window_size=2) - shape: (5,) - Series: \'a\' [f64] - [ - null - 150.0 - 250.0 - 350.0 - 450.0 - ] - - ''' - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling sum (moving sum) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their sum. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length of the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.rolling_sum(window_size=2) - shape: (5,) - Series: \'a\' [i64] - [ - null - 3 - 5 - 7 - 9 - ] - - ''' - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling std dev. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their std dev. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_std(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 1.0 - 1.527525 - 2.0 - ] - - ''' - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling variance. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. 
The resulting values will be aggregated to their variance. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_var(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 1.0 - 2.333333 - 4.0 - ] - - ''' - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a custom rolling window function. - - .. warning:: - Computing custom functions is extremely slow. Use specialized rolling - functions such as :func:`Series.rolling_sum` if at all possible. - - Parameters - ---------- - function - Custom aggregation function. - window_size - Size of the window. The window at a given row will include the row - itself and the `window_size - 1` elements before it. - weights - A list of weights with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window. - - Warnings - -------- - - - Examples - -------- - >>> from numpy import nansum - >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) - >>> s.rolling_map(nansum, window_size=3) - shape: (5,) - Series: \'\' [f64] - [ - null - null - 22.0 - 11.0 - 17.0 - ] - - ''' - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling median. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_median(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 2.0 - 3.0 - 4.0 - 6.0 - ] - - ''' - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling quantile. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - window_size - The length of the window. 
- weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_quantile(quantile=0.33, window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 2.0 - 3.0 - 4.0 - ] - >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.66 - 2.66 - 3.66 - 5.32 - ] - - ''' - def rolling_skew(self, window_size: int) -> Series: - """ - Compute a rolling skew. - - The window at a given row includes the row itself and the - `window_size - 1` elements before it. - - Parameters - ---------- - window_size - Integer size of the rolling window. - bias - If False, the calculations are corrected for statistical bias. - - Examples - -------- - >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) - shape: (4,) - Series: '' [f64] - [ - null - null - 0.381802 - 0.47033 - ] - - Note how the values match - - >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() - (0.38180177416060584, 0.47033046033698594) - - """ - def sample(self, n: int | None = ...) -> Series: - ''' - Sample from this Series. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - Shuffle the order of sampled data points. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 5 - ] - - ''' - def peak_max(self) -> Self: - ''' - Get a boolean mask of the local maximum peaks. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.peak_max() - shape: (5,) - Series: \'a\' [bool] - [ - false - false - false - false - true - ] - - ''' - def peak_min(self) -> Self: - ''' - Get a boolean mask of the local minimum peaks. - - Examples - -------- - >>> s = pl.Series("a", [4, 1, 3, 2, 5]) - >>> s.peak_min() - shape: (5,) - Series: \'a\' [bool] - [ - false - true - false - true - false - ] - - ''' - def n_unique(self) -> int: - ''' - Count the number of unique values in this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.n_unique() - 3 - - ''' - def shrink_to_fit(self) -> Series: - """ - Shrink Series memory usage. - - Shrinks the underlying array capacity to exactly fit the actual data. - (Note that this function does not change the Series data type). - - """ - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: - ''' - Hash the Series. - - The hash value is of type `UInt64`. - - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. 
- - Notes - ----- - This implementation of :func:`hash` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.hash(seed=42) # doctest: +IGNORE_RESULT - shape: (3,) - Series: \'a\' [u64] - [ - 10734580197236529959 - 3022416320763508302 - 13756996518000038261 - ] - - ''' - def reinterpret(self) -> Series: - """ - Reinterpret the underlying bits as a signed/unsigned integer. - - This operation is only allowed for 64bit integers. For lower bits integers, - you can safely use that cast operation. - - Parameters - ---------- - signed - If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. - - """ - def interpolate(self, method: InterpolationMethod = ...) -> Series: - ''' - Fill null values using interpolation. - - Parameters - ---------- - method : {\'linear\', \'nearest\'} - Interpolation method. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None, None, 5]) - >>> s.interpolate() - shape: (5,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - 4.0 - 5.0 - ] - - ''' - def abs(self) -> Series: - """ - Compute absolute values. - - Same as `abs(series)`. - """ - def rank(self, method: RankMethod = ...) -> Series: - ''' - Assign ranks to data, dealing with ties appropriately. - - Parameters - ---------- - method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} - The method used to assign ranks to tied elements. - The following methods are available (default is \'average\'): - - - \'average\' : The average of the ranks that would have been assigned to - all the tied values is assigned to each value. - - \'min\' : The minimum of the ranks that would have been assigned to all - the tied values is assigned to each value. (This is also referred to - as "competition" ranking.) - - \'max\' : The maximum of the ranks that would have been assigned to all - the tied values is assigned to each value. - - \'dense\' : Like \'min\', but the rank of the next highest element is - assigned the rank immediately after those assigned to the tied - elements. - - \'ordinal\' : All values are given a distinct rank, corresponding to - the order that the values occur in the Series. - - \'random\' : Like \'ordinal\', but the rank for ties is not dependent - on the order that the values occur in the Series. - descending - Rank in descending order. - seed - If `method="random"`, use this as seed. - - Examples - -------- - The \'average\' method: - - >>> s = pl.Series("a", [3, 6, 1, 1, 6]) - >>> s.rank() - shape: (5,) - Series: \'a\' [f64] - [ - 3.0 - 4.5 - 1.5 - 1.5 - 4.5 - ] - - The \'ordinal\' method: - - >>> s = pl.Series("a", [3, 6, 1, 1, 6]) - >>> s.rank("ordinal") - shape: (5,) - Series: \'a\' [u32] - [ - 3 - 4 - 1 - 2 - 5 - ] - - ''' - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: - ''' - Calculate the first discrete difference between shifted items. - - Parameters - ---------- - n - Number of slots to shift. - null_behavior : {\'ignore\', \'drop\'} - How to handle null values. - - Examples - -------- - >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) - >>> s.diff() - shape: (5,) - Series: \'s\' [i8] - [ - null - -10 - 20 - -5 - 10 - ] - - >>> s.diff(n=2) - shape: (5,) - Series: \'s\' [i8] - [ - null - null - 10 - 15 - 5 - ] - - >>> s.diff(n=2, null_behavior="drop") - shape: (3,) - Series: \'s\' [i8] - [ - 10 - 15 - 5 - ] - - ''' - def pct_change(self, n: int | IntoExprColumn = ...) 
-> Series: - """ - Computes percentage change between values. - - Percentage change (as fraction) between current element and most-recent - non-null element at least `n` period(s) before the current element. - - Computes the change from the previous row by default. - - Parameters - ---------- - n - periods to shift for forming percent change. - - Examples - -------- - >>> pl.Series(range(10)).pct_change() - shape: (10,) - Series: '' [f64] - [ - null - inf - 1.0 - 0.5 - 0.333333 - 0.25 - 0.2 - 0.166667 - 0.142857 - 0.125 - ] - - >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) - shape: (10,) - Series: '' [f64] - [ - null - null - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - ] - - """ - def skew(self) -> float | None: - """ - Compute the sample skewness of a data set. - - For normally distributed data, the skewness should be about zero. For - unimodal continuous distributions, a skewness value greater than zero means - that there is more weight in the right tail of the distribution. The - function `skewtest` can be used to determine if the skewness value - is close enough to zero, statistically speaking. - - - See scipy.stats for more information. - - Parameters - ---------- - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Notes - ----- - The sample skewness is computed as the Fisher-Pearson coefficient - of skewness, i.e. - - .. math:: g_1=\\frac{m_3}{m_2^{3/2}} - - where - - .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i - - is the biased sample :math:`i\\texttt{th}` central moment, and - :math:`\\bar{x}` is - the sample mean. If `bias` is False, the calculations are - corrected for bias and the value computed is the adjusted - Fisher-Pearson standardized moment coefficient, i.e. - - .. math:: - G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} - - """ - def kurtosis(self) -> float | None: - """ - Compute the kurtosis (Fisher or Pearson) of a dataset. - - Kurtosis is the fourth central moment divided by the square of the - variance. If Fisher's definition is used, then 3.0 is subtracted from - the result to give 0.0 for a normal distribution. - If bias is False then the kurtosis is calculated using k statistics to - eliminate bias coming from biased moment estimators - - See scipy.stats for more information - - Parameters - ---------- - fisher : bool, optional - If True, Fisher's definition is used (normal ==> 0.0). If False, - Pearson's definition is used (normal ==> 3.0). - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - """ - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: - """ - Set values outside the given boundaries to the boundary value. - - Parameters - ---------- - lower_bound - Lower bound. Accepts expression input. - Non-expression inputs are parsed as literals. - If set to `None` (default), no lower bound is applied. - upper_bound - Upper bound. Accepts expression input. - Non-expression inputs are parsed as literals. - If set to `None` (default), no upper bound is applied. - - See Also - -------- - when - - Notes - ----- - This method only works for numeric and temporal columns. To clip other data - types, consider writing a `when-then-otherwise` expression. See :func:`when`. 
- - Examples - -------- - Specifying both a lower and upper bound: - - >>> s = pl.Series([-50, 5, 50, None]) - >>> s.clip(1, 10) - shape: (4,) - Series: '' [i64] - [ - 1 - 5 - 10 - null - ] - - Specifying only a single bound: - - >>> s.clip(upper_bound=10) - shape: (4,) - Series: '' [i64] - [ - -50 - 5 - 10 - null - ] - - """ - def lower_bound(self) -> Self: - ''' - Return the lower bound of this Series\' dtype as a unit Series. - - See Also - -------- - upper_bound : return the upper bound of the given Series\' dtype. - - Examples - -------- - >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) - >>> s.lower_bound() - shape: (1,) - Series: \'s\' [i32] - [ - -2147483648 - ] - - >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) - >>> s.lower_bound() - shape: (1,) - Series: \'s\' [f32] - [ - -inf - ] - - ''' - def upper_bound(self) -> Self: - ''' - Return the upper bound of this Series\' dtype as a unit Series. - - See Also - -------- - lower_bound : return the lower bound of the given Series\' dtype. - - Examples - -------- - >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) - >>> s.upper_bound() - shape: (1,) - Series: \'s\' [i8] - [ - 127 - ] - - >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) - >>> s.upper_bound() - shape: (1,) - Series: \'s\' [f64] - [ - inf - ] - - ''' - def replace(self, mapping: dict[Any, Any]) -> Self: - ''' - Replace values according to the given mapping. - - Needs a global string cache for lazily evaluated queries on columns of - type `Categorical`. - - Parameters - ---------- - mapping - Mapping of values to their replacement. - default - Value to use when the mapping does not contain the lookup value. - Defaults to keeping the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - - See Also - -------- - str.replace - - Examples - -------- - Replace a single value by another value. Values not in the mapping remain - unchanged. - - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.replace({2: 100}) - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 100 - 100 - 3 - ] - - Replace multiple values. Specify a default to set values not in the given map - to the default value. - - >>> s = pl.Series("country_code", ["FR", "ES", "DE", None]) - >>> country_code_map = { - ... "CA": "Canada", - ... "DE": "Germany", - ... "FR": "France", - ... None: "unspecified", - ... } - >>> s.replace(country_code_map, default=None) - shape: (4,) - Series: \'country_code\' [str] - [ - "France" - null - "Germany" - "unspecified" - ] - - The return type can be overridden with the `return_dtype` argument. - - >>> s = pl.Series("a", [0, 1, 2, 3]) - >>> s.replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) - shape: (4,) - Series: \'a\' [u8] - [ - 0 - 10 - 20 - 0 - ] - ''' - def reshape(self, dimensions: tuple[int, ...]) -> Series: - ''' - Reshape this Series to a flat Series or a Series of Lists. - - Parameters - ---------- - dimensions - Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that - dimension is inferred. - - Returns - ------- - Series - If a single dimension is given, results in a Series of the original - data type. - If a multiple dimensions are given, results in a Series of data type - :class:`List` with shape (rows, cols). - - See Also - -------- - Series.list.explode : Explode a list column. 
- - Examples - -------- - >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) - >>> s.reshape((3, 3)) - shape: (3,) - Series: \'foo\' [list[i64]] - [ - [1, 2, 3] - [4, 5, 6] - [7, 8, 9] - ] - - ''' - def shuffle(self, seed: int | None = ...) -> Series: - ''' - Shuffle the contents of this Series. - - Parameters - ---------- - seed - Seed for the random number generator. If set to None (default), a - random seed is generated each time the shuffle is called. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.shuffle(seed=1) - shape: (3,) - Series: \'a\' [i64] - [ - 2 - 1 - 3 - ] - - ''' - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - """ - Exponentially-weighted moving average. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series([1, 2, 3]) - >>> s.ewm_mean(com=1) - shape: (3,) - Series: '' [f64] - [ - 1.0 - 1.666667 - 2.428571 - ] - - """ - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - ''' - Exponentially-weighted moving standard deviation. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. 
math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.ewm_std(com=1) - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 0.707107 - 0.963624 - ] - - ''' - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - ''' - Exponentially-weighted moving variance. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. 
For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.ewm_var(com=1) - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 0.5 - 0.928571 - ] - - ''' - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: - """ - Extremely fast method for extending the Series with 'n' copies of a value. - - Parameters - ---------- - value - A constant literal value (not an expression) with which to extend - the Series; can pass None to extend with nulls. - n - The number of additional values that will be added. - - Examples - -------- - >>> s = pl.Series([1, 2, 3]) - >>> s.extend_constant(99, n=2) - shape: (5,) - Series: '' [i64] - [ - 1 - 2 - 3 - 99 - 99 - ] - - """ - def set_sorted(self) -> Self: - ''' - Flags the Series as \'sorted\'. - - Enables downstream code to user fast paths for sorted arrays. - - Parameters - ---------- - descending - If the `Series` order is descending. - - Warnings - -------- - This can lead to incorrect results if this `Series` is not sorted!! - Use with care! - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.set_sorted().max() - 3 - - ''' - def new_from_index(self, index: int, length: int) -> Self: - """Create a new Series filled with values from the given index.""" - def shrink_dtype(self) -> Series: - """ - Shrink numeric columns to the minimal required datatype. - - Shrink to the dtype needed to fit the extrema of this [`Series`]. - This can be used to reduce memory pressure. - """ - def get_chunks(self) -> list[Series]: - """Get the chunks of this Series as a list of Series.""" - def implode(self) -> Self: - """Aggregate values into a list.""" - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom/user-defined function (UDF) over elements in this Series. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Series.map_elements`. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output datatype. If none is given, the same datatype as this Series will be - used. - skip_nulls - Nulls will be skipped and not passed to the python function. - This is faster because python can be skipped and because we call - more specialized functions. - - """ - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - """ - Apply a custom rolling window function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Series.rolling_map`. - - Parameters - ---------- - function - Aggregation function - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - """ - def is_first(self) -> Series: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Series.is_first_distinct`. - - Returns - ------- - Series - Series of data type :class:`Boolean`. 
- - """ - def is_last(self) -> Series: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Series.is_last_distinct`. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - """ - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: - """ - Clip (limit) the values in an array to a `min` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - lower_bound - Lower bound. - - """ - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: - """ - Clip (limit) the values in an array to a `max` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - upper_bound - Upper bound. - - """ - def shift_and_fill(self, fill_value: int | Expr) -> Series: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - Fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def is_float(self) -> bool: - ''' - Check if this Series has floating point numbers. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_float()` instead. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0]) - >>> s.is_float() # doctest: +SKIP - True - - ''' - def is_integer(self, signed: bool | None = ...) -> bool: - ''' - Check if this Series datatype is an integer (signed or unsigned). - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_integer()` instead. - For signed/unsigned variants, use `Series.dtype.is_signed_integer()` - or `Series.dtype.is_unsigned_integer()`. - - Parameters - ---------- - signed - * if `None`, both signed and unsigned integer dtypes will match. - * if `True`, only signed integer dtypes will be considered a match. - * if `False`, only unsigned integer dtypes will be considered a match. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) - >>> s.is_integer() # doctest: +SKIP - True - >>> s.is_integer(signed=False) # doctest: +SKIP - True - >>> s.is_integer(signed=True) # doctest: +SKIP - False - - ''' - def is_numeric(self) -> bool: - ''' - Check if this Series datatype is numeric. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_float()` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.is_numeric() # doctest: +SKIP - True - - ''' - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: - """ - Check if this Series datatype is temporal. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_temporal()` instead. - - Parameters - ---------- - excluding - Optionally exclude one or more temporal dtypes from matching. - - Examples - -------- - >>> from datetime import date - >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) - >>> s.is_temporal() # doctest: +SKIP - True - >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP - False - - """ - def is_boolean(self) -> bool: - ''' - Check if this Series is a Boolean. - - .. deprecated:: 0.19.14 - Use `Series.dtype == pl.Boolean` instead. - - Examples - -------- - >>> s = pl.Series("a", [True, False, True]) - >>> s.is_boolean() # doctest: +SKIP - True - - ''' - def is_utf8(self) -> bool: - ''' - Check if this Series datatype is a Utf8. - - .. deprecated:: 0.19.14 - Use `Series.dtype == pl.Utf8` instead. 
- - Examples - -------- - >>> s = pl.Series("x", ["a", "b", "c"]) - >>> s.is_utf8() # doctest: +SKIP - True - - ''' - def take_every(self, n: int) -> Series: - """ - Take every nth value in the Series and return as new Series. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: - """ - Take values by index. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather`. - - Parameters - ---------- - indices - Index location used for selection. - """ - def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: - """ - Set values at the index locations. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`scatter`. - - Parameters - ---------- - indices - Integers representing the index locations. - values - Replacement values. - """ - def cumsum(self) -> Series: - """ - Get an array with the cumulative sum computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_sum`. - - Parameters - ---------- - reverse - reverse the operation. - - """ - def cummax(self) -> Series: - """ - Get an array with the cumulative max computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_max`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def cummin(self) -> Series: - """ - Get an array with the cumulative min computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_min`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def cumprod(self) -> Series: - """ - Get an array with the cumulative product computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_prod`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def view(self) -> SeriesView: - """ - Get a view into this Series data with a numpy array. - - .. deprecated:: 0.19.14 - This method will be removed in a future version. - - This operation doesn't clone data, but does not include missing values. - Don't use this unless you know what you are doing. - - Parameters - ---------- - ignore_nulls - If True then nulls are converted to 0. - If False then an Exception is raised if nulls are present. - - """ - def map_dict(self, mapping: dict[Any, Any]) -> Self: - """ - Replace values in the Series using a remapping dictionary. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`replace`. The default behavior - has changed to keep any values not present in the mapping unchanged. - Pass `default=None` to keep existing behavior. - - Parameters - ---------- - mapping - Dictionary containing the before/after values to map. - default - Value to use when the remapping dict does not contain the lookup value. - Use `pl.first()`, to keep the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - """ - def series_equal(self, other: Series) -> bool: - """ - Check whether the Series is equal to another Series. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`equals`. - - Parameters - ---------- - other - Series to compare with. 
- null_equal - Consider null values as equal. - strict - Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a - `pl.Int64` will return `False`. - """ - @property - def dtype(self): ... - @property - def flags(self): ... - @property - def inner_dtype(self): ... - @property - def name(self): ... - @property - def shape(self): ... - @property - def bin(self): ... - @property - def cat(self): ... - @property - def dt(self): ... - @property - def list(self): ... - @property - def arr(self): ... - @property - def str(self): ... - @property - def struct(self): ... -def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: - """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/series/series.pyi new file mode 100644 index 0000000..a2385bc --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.1/polars/series/series.pyi @@ -0,0 +1,5035 @@ +#: version 0.20.1 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Enum as Enum, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Null as Null, Object as Object, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import ModuleUpgradeRequired as ModuleUpgradeRequired, ShapeError as ShapeError +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta +from polars.utils.deprecation import 
deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, _warn_null_comparison as _warn_null_comparison, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the `Series` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series <= other`.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series < other`.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series == other`.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series == other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series != other`.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series != other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series >= other`.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series > other`.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. 
+ Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the Series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to `s[0]`, with a check + that the shape is (1,). With an index, this is equivalent to `s[index]`. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cum_sum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. 
_Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a Series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + Series has a numeric dtype). All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> s = pl.Series([1, 2, 3, 4, 5]) + >>> s.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + Non-numeric data types may not have all statistics available. 
+ + >>> s = pl.Series(["a", "a", None, "b", "c"]) + >>> s.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 4 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. 
+ + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. 
This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. + + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. 
+ If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + include_breakpoint + Include a column that indicates the upper breakpoint. + include_category + Include a column that shows the intervals as categories. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬───────┐ + │ break_point ┆ category ┆ count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═══════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴───────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬───────┐ + │ color ┆ count │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴───────┘ + + Sort the output by count. + + >>> s.value_counts(sort=True) + shape: (3, 2) + ┌───────┬───────┐ + │ color ┆ count │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴───────┘ + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. 
+ + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cum_max(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cum_max() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cum_min(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cum_min() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cum_prod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_prod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cum_sum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_sum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + The resulting series will consist of multiple chunks. + + Parameters + ---------- + other + Series to append. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. + + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from `append`, which adds the chunks from `other` to the chunks of + this series, `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer `append` over `extend` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single `Series`. In the latter case, finish the sequence + of `append` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first `abs(n)`. 
+ + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + head + + """ + def gather_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. 
+ side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', the index of the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + \'\'\' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + \'\'\' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + \'\'\' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + \'\'\' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no `null` values. + + Notes + ----- + While the *absence* of a validity bitmask guarantees that a Series does not + have `null` values, the converse is not true, e.g. the *presence* of a + bitmask does not mean that there are null values, as every value of the + bitmask could be `false`. + + To confirm that a column has `null` values use :func:`null_count`. + + """ + def is_empty(self) -> bool: + \'\'\' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + \'\'\' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order. + + """ + def not_(self) -> Series: + \'\'\' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + \'\'\' + def is_null(self) -> Series: + \'\'\' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + \'\'\' + def is_not_null(self) -> Series: + \'\'\' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + \'\'\' + def is_finite(self) -> Series: + \'\'\' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + \'\'\' + def is_infinite(self) -> Series: + \'\'\' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`.
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + \'\'\' + def is_nan(self) -> Series: + \'\'\' + Returns a boolean Series indicating which values are NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + \'\'\' + def is_not_nan(self) -> Series: + \'\'\' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + \'\'\' + def is_in(self, other: Series | Collection[Any]) -> Series: + \'\'\' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + \'\'\' + def arg_true(self) -> Series: + \'\'\' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + \'\'\' + def is_unique(self) -> Series: + \'\'\' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + \'\'\' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + \'\'\' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`.
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def equals(self, other: Series) -> bool: + ''' + Check whether the Series is equal to another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + See Also + -------- + assert_series_equal + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s1.equals(s1) + True + >>> s1.equals(s2) + False + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. 
+ closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + \'\'\' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + \'\'\' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point `nan` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set `zero_copy_only=True`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + <class 'numpy.ndarray'> + + \'\'\' + def _view(self) -> SeriesView: + \'\'\' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + + Returns + ------- + SeriesView + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s._view(ignore_nulls=True) + SeriesView([1, 0]) + + \'\'\' + def to_arrow(self) -> pa.Array: + \'\'\' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + <pyarrow.lib.Int64Array object at ...> + [ + 1, + 2, + 3 + ] + + \'\'\' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + \'\'\' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of null + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`.
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def count(self) -> int: + ''' + Return the number of non-null elements in the column. + + See Also + -------- + len + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.count() + 2 + ''' + def len(self) -> int: + ''' + Return the number of elements in the Series. + + Null values count towards the total. + + See Also + -------- + count + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + ''' + def set(self, filter: Series, value: int | float | str | bool | None) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimization (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.scatter(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. 
+ + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def round_sig_figs(self, digits: int) -> Series: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s.round_sig_figs(2) + shape: (3,) + Series: '' [f64] + [ + 0.012 + 3.3 + 1200.0 + ] + + """ + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. 
+ + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. 
+ + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. + + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify `fill_value` to fill the resulting null values. + + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their std dev. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. 
+ + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). 
If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no lower bound is applied. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no upper bound is applied. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> s = pl.Series([-50, 5, 50, None]) + >>> s.clip(1, 10) + shape: (4,) + Series: '' [i64] + [ + 1 + 5 + 10 + null + ] + + Specifying only a single bound: + + >>> s.clip(upper_bound=10) + shape: (4,) + Series: '' [i64] + [ + -50 + 5 + 10 + null + ] + + """ + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def replace(self, old: IntoExpr | Sequence[Any] | Mapping[Any, Any], new: IntoExpr | Sequence[Any] | NoDefault = ...) -> Self: + ''' + Replace values by different values. + + Parameters + ---------- + old + Value or sequence of values to replace. + Also accepts a mapping of values to their replacement as syntactic sugar for + `replace(new=Series(mapping.keys()), old=Series(mapping.values()))`. + new + Value or sequence of values to replace by. + Length must match the length of `old` or have length 1. + default + Set values that were not replaced to this value. + Defaults to keeping the original value. + Accepts expression input. Non-expression inputs are parsed as literals. + return_dtype + The data type of the resulting Series. If set to `None` (default), + the data type is determined automatically based on the other inputs. + + See Also + -------- + str.replace + + Notes + ----- + The global string cache must be enabled when replacing categorical values. + + Examples + -------- + Replace a single value by another value. Values that were not replaced remain + unchanged. 
+ + >>> s = pl.Series([1, 2, 2, 3]) + >>> s.replace(2, 100) + shape: (4,) + Series: \'\' [i64] + [ + 1 + 100 + 100 + 3 + ] + + Replace multiple values by passing sequences to the `old` and `new` parameters. + + >>> s.replace([2, 3], [100, 200]) + shape: (4,) + Series: \'\' [i64] + [ + 1 + 100 + 100 + 200 + ] + + Passing a mapping with replacements is also supported as syntactic sugar. + Specify a default to set all values that were not matched. + + >>> mapping = {2: 100, 3: 200} + >>> s.replace(mapping, default=-1) + shape: (4,) + Series: \'\' [i64] + [ + -1 + 100 + 100 + 200 + ] + + + The default can be another Series. + + >>> default = pl.Series([2.5, 5.0, 7.5, 10.0]) + >>> s.replace(2, 100, default=default) + shape: (4,) + Series: \'\' [f64] + [ + 2.5 + 100.0 + 100.0 + 10.0 + ] + + Replacing by values of a different data type sets the return type based on + a combination of the `new` data type and either the original data type or the + default data type if it was set. + + >>> s = pl.Series(["x", "y", "z"]) + >>> mapping = {"x": 1, "y": 2, "z": 3} + >>> s.replace(mapping) + shape: (3,) + Series: \'\' [str] + [ + "1" + "2" + "3" + ] + >>> s.replace(mapping, default=None) + shape: (3,) + Series: \'\' [i64] + [ + 1 + 2 + 3 + ] + + Set the `return_dtype` parameter to control the resulting data type directly. + + >>> s.replace(mapping, return_dtype=pl.UInt8) + shape: (3,) + Series: \'\' [u8] + [ + 1 + 2 + 3 + ] + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). 
+ + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() # doctest: +SKIP + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_integer()` instead. + For signed/unsigned variants, use `Series.dtype.is_signed_integer()` + or `Series.dtype.is_unsigned_integer()`. + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() # doctest: +SKIP + True + >>> s.is_integer(signed=False) # doctest: +SKIP + True + >>> s.is_integer(signed=True) # doctest: +SKIP + False + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_numeric()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() # doctest: +SKIP + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_temporal()` instead. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() # doctest: +SKIP + True + >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP + False + + """ + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Boolean` instead. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() # doctest: +SKIP + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Utf8` instead. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() # doctest: +SKIP + True + + ''' + def take_every(self, n: int) -> Series: + """ + Take every nth value in the Series and return as new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + Index location used for selection. + """ + def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + """ + Set values at the index locations. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`scatter`. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. 
+ """ + def cumsum(self) -> Series: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + reverse the operation. + + """ + def cummax(self) -> Series: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummin(self) -> Series: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cumprod(self) -> Series: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def view(self) -> SeriesView: + """ + Get a view into this Series data with a numpy array. + + .. deprecated:: 0.19.14 + This method will be removed in a future version. + + This operation doesn't clone data, but does not include missing values. + Don't use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in the Series using a remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + def series_equal(self, other: Series) -> bool: + """ + Check whether the Series is equal to another Series. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`equals`. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
+def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/dataframe/frame deleted file mode 100644 index 562effd..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/dataframe/frame +++ /dev/null @@ -1,6977 +0,0 @@ -import P -import deltalake -import np as np -import pa as pa -import pd as pd -from _io import BytesIO, TextIOWrapper - -from builtins import PyDataFrame -from pathlib import Path -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes.classes import Boolean as Boolean, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 -from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.col import col as col -from polars.functions.lit import lit as lit -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, 
handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence - -TYPE_CHECKING: bool -INTEGER_DTYPES: frozenset -N_INFER_DEFAULT: int -_PYARROW_AVAILABLE: bool -_dtype_str_repr: builtin_function_or_method - -class DataFrame: - _accessors: _ClassVar[set] = ... - columns: Incomplete - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: - """Construct Polars DataFrame from FFI PyDataFrame object.""" - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from a dictionary of sequences. - - Parameters - ---------- - data : dict of sequences - Two-dimensional data represented as a dictionary. dict must contain - Sequences. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - - """ - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from a sequence of sequences. - - Parameters - ---------- - data : Sequence of sequences - Two-dimensional data represented as a sequence of sequences. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - orient : {'col', 'row'}, default None - Whether to interpret two-dimensional data as columns or as rows. If None, - the orientation is inferred by matching the columns and data dimensions. If - this does not yield conclusive results, column orientation is used. 
- infer_schema_length - How many rows to scan to determine the column type. - - """ - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from a numpy ndarray. - - Parameters - ---------- - data : numpy ndarray - Two-dimensional data represented as a numpy ndarray. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - orient : {'col', 'row'}, default None - Whether to interpret two-dimensional data as columns or as rows. If None, - the orientation is inferred by matching the columns and data dimensions. If - this does not yield conclusive results, column orientation is used. - - """ - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from an Arrow table. - - This operation will be zero copy for the most part. Types that are not - supported by Polars may be cast to the closest supported type. - - Parameters - ---------- - data : arrow table, array, or sequence of sequences - Data representing an Arrow Table or Array. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - rechunk : bool, default True - Make sure that all data is in contiguous memory. - - """ - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a Polars DataFrame from a pandas DataFrame. - - Parameters - ---------- - data : pandas DataFrame - Two-dimensional data represented as a pandas DataFrame. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. 
The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - rechunk : bool, default True - Make sure that all data is in contiguous memory. - nan_to_null : bool, default True - If the data contains NaN values they will be converted to null/None. - include_index : bool, default False - Load any non-default pandas indexes as columns. - - """ - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: - """ - Read a CSV file into a DataFrame. - - Use `pl.read_csv` to dispatch to this method. - - See Also - -------- - polars.io.read_csv - - """ - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: - """ - Read into a DataFrame from a parquet file. - - Use `pl.read_parquet` to dispatch to this method. - - See Also - -------- - polars.io.read_parquet - - """ - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: - """ - Read into a DataFrame from Apache Avro format. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns. - n_rows - Stop reading from Apache Avro file after reading `n_rows`. - - """ - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: - ''' - Read into a DataFrame from Arrow IPC file format. - - See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. - Arrow IPC files are also known as Feather (v2) files. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns to select. Accepts a list of column indices (starting at zero) or a - list of column names. - n_rows - Stop reading from IPC file after reading `n_rows`. - row_count_name - Row count name. - row_count_offset - Row count offset. - rechunk - Make sure that all data is contiguous. - memory_map - Memory map the file - - ''' - @classmethod - def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: - ''' - Read into a DataFrame from Arrow IPC record batch stream format. - - See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns to select. Accepts a list of column indices (starting at zero) or a - list of column names. - n_rows - Stop reading from IPC stream after reading `n_rows`. - row_count_name - Row count name. - row_count_offset - Row count offset. - rechunk - Make sure that all data is contiguous. - - ''' - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: - """ - Read into a DataFrame from a JSON file. - - Use `pl.read_json` to dispatch to this method. - - See Also - -------- - polars.io.read_json - - """ - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: - """ - Read into a DataFrame from a newline delimited JSON file. 
- - Use `pl.read_ndjson` to dispatch to this method. - - See Also - -------- - polars.io.read_ndjson - - """ - def _replace(self, column: str, new_column: Series) -> Self: - """Replace a column by a new Series (in place).""" - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: - """ - Numpy __array__ interface protocol. - - Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see - https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. - """ - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: - ''' - Convert to a dataframe object implementing the dataframe interchange protocol. - - Parameters - ---------- - nan_as_null - Overwrite null values in the data with `NaN`. - - .. warning:: - This functionality has not been implemented and the parameter will be - removed in a future version. - Setting this to `True` will raise a `NotImplementedError`. - allow_copy - Allow memory to be copied to perform the conversion. If set to `False`, - causes conversions that are not zero-copy to fail. - - Notes - ----- - Details on the Python dataframe interchange protocol: - https://data-apis.org/dataframe-protocol/latest/index.html - - Examples - -------- - Convert a Polars DataFrame to a generic dataframe object and access some - properties. - - >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) - >>> dfi = df.__dataframe__() - >>> dfi.num_rows() - 2 - >>> dfi.get_column(1).dtype - (, 64, \'g\', \'=\') - - ''' - def __dataframe_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. - """ - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with another object.""" - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with another DataFrame.""" - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with a non-DataFrame object.""" - def _div(self, other: Any) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Series]: ... - def __reversed__(self) -> Iterator[Series]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... 
- def _take_with_series(self, s: Series) -> DataFrame: ... - def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: - """Get item. Does quite a lot. Read the comments.""" - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: - """ - Format output data in HTML for display in Jupyter Notebooks. - - Output rows and columns can be modified by setting the following ENVIRONMENT - variables: - - * POLARS_FMT_MAX_COLS: set the number of columns - * POLARS_FMT_MAX_ROWS: set the number of rows - - """ - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: - ''' - Return the DataFrame as a scalar, or return the element at the given row/column. - - Parameters - ---------- - row - Optional row index. - column - Optional column index or name. - - See Also - -------- - row: Get the values of a single row, either by index or by predicate. - - Notes - ----- - If row/col not provided, this is equivalent to `df[0,0]`, with a check that - the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> df.select((pl.col("a") * pl.col("b")).sum()).item() - 32 - >>> df.item(1, 1) - 5 - >>> df.item(2, "b") - 6 - - ''' - def to_arrow(self) -> pa.Table: - ''' - Collect the underlying arrow arrays in an Arrow Table. - - This operation is mostly zero copy. - - Data types that do copy: - - CategoricalType - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} - ... ) - >>> df.to_arrow() - pyarrow.Table - foo: int64 - bar: large_string - ---- - foo: [[1,2,3,4,5,6]] - bar: [["a","b","c","d","e","f"]] - - ''' - def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: - ''' - Convert DataFrame to a dictionary mapping column name to values. - - Parameters - ---------- - as_series - True -> Values are Series - False -> Values are List[Any] - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4, 5], - ... "fruits": ["banana", "banana", "apple", "apple", "banana"], - ... "B": [5, 4, 3, 2, 1], - ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], - ... "optional": [28, 300, None, 2, -30], - ... } - ... 
) - >>> df - shape: (5, 5) - ┌─────┬────────┬─────┬────────┬──────────┐ - │ A ┆ fruits ┆ B ┆ cars ┆ optional │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ - ╞═════╪════════╪═════╪════════╪══════════╡ - │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ - │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ - │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ - │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ - │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ - └─────┴────────┴─────┴────────┴──────────┘ - >>> df.to_dict(as_series=False) - {\'A\': [1, 2, 3, 4, 5], - \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], - \'B\': [5, 4, 3, 2, 1], - \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], - \'optional\': [28, 300, None, 2, -30]} - >>> df.to_dict(as_series=True) - {\'A\': shape: (5,) - Series: \'A\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ], \'fruits\': shape: (5,) - Series: \'fruits\' [str] - [ - "banana" - "banana" - "apple" - "apple" - "banana" - ], \'B\': shape: (5,) - Series: \'B\' [i64] - [ - 5 - 4 - 3 - 2 - 1 - ], \'cars\': shape: (5,) - Series: \'cars\' [str] - [ - "beetle" - "audi" - "beetle" - "beetle" - "beetle" - ], \'optional\': shape: (5,) - Series: \'optional\' [i64] - [ - 28 - 300 - null - 2 - -30 - ]} - - ''' - def to_dicts(self) -> list[dict[str, Any]]: - ''' - Convert every row to a dictionary of Python-native values. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.to_dicts() - [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] - - ''' - def to_numpy(self) -> np.ndarray[Any, Any]: - ''' - Convert DataFrame to a 2D NumPy array. - - This operation clones data. - - Parameters - ---------- - structured - Optionally return a structured array, with field names and - dtypes that correspond to the DataFrame schema. - order - The index order of the returned NumPy array, either C-like or - Fortran-like. In general, using the Fortran-like index order is faster. - However, the C-like order might be more appropriate to use for downstream - applications to prevent cloning data, e.g. when reshaping into a - one-dimensional array. Note that this option only takes effect if - `structured` is set to `False` and the DataFrame dtypes allow for a - global dtype for all columns. - - Notes - ----- - If you\'re attempting to convert Utf8 to an array you\'ll need to install - `pyarrow`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.5, 7.0, 8.5], - ... "ham": ["a", "b", "c"], - ... }, - ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, - ... ) - - Export to a standard 2D numpy array. - - >>> df.to_numpy() - array([[1, 6.5, \'a\'], - [2, 7.0, \'b\'], - [3, 8.5, \'c\']], dtype=object) - - Export to a structured array, which can better-preserve individual - column data, such as name and dtype... - - >>> df.to_numpy(structured=True) - array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], - dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np - >>> df.to_numpy(structured=True).view(np.recarray) - rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], - dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: - ''' - Cast to a pandas DataFrame. 
- - This requires that :mod:`pandas` and :mod:`pyarrow` are installed. - This operation clones data, unless `use_pyarrow_extension_array=True`. - - Parameters - ---------- - use_pyarrow_extension_array - Use PyArrow backed-extension arrays instead of numpy arrays for each column - of the pandas DataFrame; this allows zero copy operations and preservation - of null values. Subsequent operations on the resulting pandas DataFrame may - trigger conversion to NumPy arrays if that operation is not supported by - pyarrow compute functions. - **kwargs - Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. - - Returns - ------- - :class:`pandas.DataFrame` - - Examples - -------- - >>> import pandas - >>> df1 = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> pandas_df1 = df1.to_pandas() - >>> type(pandas_df1) - - >>> pandas_df1.dtypes - foo int64 - bar int64 - ham object - dtype: object - >>> df2 = pl.DataFrame( - ... { - ... "foo": [1, 2, None], - ... "bar": [6, None, 8], - ... "ham": [None, "b", "c"], - ... } - ... ) - >>> pandas_df2 = df2.to_pandas() - >>> pandas_df2 - foo bar ham - 0 1.0 6.0 None - 1 2.0 NaN b - 2 NaN 8.0 c - >>> pandas_df2.dtypes - foo float64 - bar float64 - ham object - dtype: object - >>> pandas_df2_pa = df2.to_pandas( - ... use_pyarrow_extension_array=True - ... ) # doctest: +SKIP - >>> pandas_df2_pa # doctest: +SKIP - foo bar ham - 0 1 6 - 1 2 b - 2 8 c - >>> pandas_df2_pa.dtypes # doctest: +SKIP - foo int64[pyarrow] - bar int64[pyarrow] - ham large_string[pyarrow] - dtype: object - - ''' - def to_series(self, index: int = ...) -> Series: - ''' - Select column as Series at index location. - - Parameters - ---------- - index - Location of selection. - - See Also - -------- - get_column - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.to_series(1) - shape: (3,) - Series: \'bar\' [i64] - [ - 6 - 7 - 8 - ] - - ''' - def to_init_repr(self, n: int = ...) -> str: - ''' - Convert DataFrame to instantiatable string representation. - - Parameters - ---------- - n - Only use first n rows. - - See Also - -------- - polars.Series.to_init_repr - polars.from_repr - - Examples - -------- - >>> df = pl.DataFrame( - ... [ - ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), - ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), - ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), - ... ] - ... ) - >>> print(df.to_init_repr()) - pl.DataFrame( - [ - pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), - pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), - pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), - ] - ) - - >>> df_from_str_repr = eval(df.to_init_repr()) - >>> df_from_str_repr - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ f32 ┆ cat │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 7.0 ┆ b │ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: - ''' - Serialize to JSON representation. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - If set to `None` (default), the output is returned as a string instead. - pretty - Pretty serialize json. - row_oriented - Write to row oriented json. This is slower, but more common. 
- - See Also - -------- - DataFrame.write_ndjson - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... } - ... ) - >>> df.write_json() - \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' - >>> df.write_json(row_oriented=True) - \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' - - ''' - def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: - ''' - Serialize to newline delimited JSON representation. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - If set to `None` (default), the output is returned as a string instead. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... } - ... ) - >>> df.write_ndjson() - \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' - - ''' - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: - ''' - Write to comma-separated values (CSV) file. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - If set to `None` (default), the output is returned as a string instead. - include_bom - Whether to include UTF-8 BOM in the CSV output. - include_header - Whether to include header in the CSV output. - separator - Separate CSV fields with this symbol. - line_terminator - String used to end each row. - quote_char - Byte to use as quoting character. - batch_size - Number of rows that will be processed per thread. - datetime_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. If no format specified, the default fractional-second - precision is inferred from the maximum timeunit found in the frame\'s - Datetime cols (if any). - date_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. - time_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. - float_precision - Number of decimal places to write, applied to both `Float32` and - `Float64` datatypes. - null_value - A string representing null values (defaulting to the empty string). - quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} - Determines the quoting strategy used. - - - necessary (default): This puts quotes around fields only when necessary. - They are necessary when fields contain a quote, - separator or record terminator. - Quotes are also necessary when writing an empty record - (which is indistinguishable from a record with one empty field). - This is the default. - - always: This puts quotes around every field. Always. - - never: This never puts quotes around fields, even if that results in - invalid CSV data (e.g.: by not quoting strings containing the separator). - - non_numeric: This puts quotes around all fields that are non-numeric. - Namely, when writing a field that does not parse as a valid float - or integer, then quotes will be used even if they aren`t strictly - necessary. - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... 
) - >>> path: pathlib.Path = dirpath / "new_file.csv" - >>> df.write_csv(path, separator=",") - - ''' - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: - ''' - Write to Apache Avro file. - - Parameters - ---------- - file - File path or writeable file-like object to which the data will be written. - compression : {\'uncompressed\', \'snappy\', \'deflate\'} - Compression method. Defaults to "uncompressed". - name - Schema name. Defaults to empty string. - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.avro" - >>> df.write_avro(path) - - ''' - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: - ''' - Write frame data to a table in an Excel workbook/worksheet. - - Parameters - ---------- - workbook : Workbook - String name or path of the workbook to create, BytesIO object to write - into, or an open `xlsxwriter.Workbook` object that has not been closed. - If None, writes to a `dataframe.xlsx` workbook in the working directory. - worksheet : str - Name of target worksheet; if None, writes to "Sheet1" when creating a new - workbook (note that writing to an existing workbook requires a valid - existing -or new- worksheet name). - position : {str, tuple} - Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. - table_style : {str, dict} - A named Excel table style, such as "Table Style Medium 4", or a dictionary - of `{"key":value,}` options containing one or more of the following keys: - "style", "first_column", "last_column", "banded_columns, "banded_rows". - table_name : str - Name of the output table object in the worksheet; can then be referred to - in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. - column_formats : dict - A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an - Excel format string to the given columns. Formats defined here (such as - "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. - dtype_formats : dict - A `{dtype:str,}` dictionary that sets the default Excel format for the - given dtype. (This can be overridden on a per-column basis by the - `column_formats` param). It is also valid to use dtype groups such as - `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform - integer and float formats. - conditional_formats : dict - A dictionary of colname (or selector) keys to a format str, dict, or list - that defines conditional formatting options for the specified columns. - - * If supplying a string typename, should be one of the valid `xlsxwriter` - types such as "3_color_scale", "data_bar", etc. - * If supplying a dictionary you can make use of any/all `xlsxwriter` - supported options, including icon sets, formulae, etc. - * Supplying multiple columns as a tuple/key will apply a single format - across all columns - this is effective in creating a heatmap, as the - min/max values will be determined across the entire range, not per-column. - * Finally, you can also supply a list made up from the above options - in order to apply *more* than one conditional format to the same range. 
- header_format : dict - A `{key:value,}` dictionary of `xlsxwriter` format options to apply - to the table header row, such as `{"bold":True, "font_color":"#702963"}`. - column_totals : {bool, list, dict} - Add a column-total row to the exported table. - - * If True, all numeric columns will have an associated total using "sum". - * If passing a string, it must be one of the valid total function names - and all numeric columns will have an associated total using that function. - * If passing a list of colnames, only those given will have a total. - * For more control, pass a `{colname:funcname,}` dict. - - Valid total function names are "average", "count_nums", "count", "max", - "min", "std_dev", "sum", and "var". - column_widths : {dict, int} - A `{colname:int,}` or `{selector:int,}` dict or a single integer that - sets (or overrides if autofitting) table column widths, in integer pixel - units. If given as an integer the same value is used for all table columns. - row_totals : {dict, bool} - Add a row-total column to the right-hand side of the exported table. - - * If True, a column called "total" will be added at the end of the table - that applies a "sum" function row-wise across all numeric columns. - * If passing a list/sequence of column names, only the matching columns - will participate in the sum. - * Can also pass a `{colname:columns,}` dictionary to create one or - more total columns with distinct names, referencing different columns. - row_heights : {dict, int} - An int or `{row_index:int,}` dictionary that sets the height of the given - rows (if providing a dictionary) or all rows (if providing an integer) that - intersect with the table body (including any header and total row) in - integer pixel units. Note that `row_index` starts at zero and will be - the header row (unless `include_header` is False). - sparklines : dict - A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more - sparklines to be written into a new column in the table. - - * If passing a list of colnames (used as the source of the sparkline data) - the default sparkline settings are used (eg: line chart with no markers). - * For more control an `xlsxwriter`-compliant options dict can be supplied, - in which case three additional polars-specific keys are available: - "columns", "insert_before", and "insert_after". These allow you to define - the source columns and position the sparkline(s) with respect to other - table columns. If no position directive is given, sparklines are added to - the end of the table (eg: to the far right) in the order they are given. - formulas : dict - A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or - more formulas to be written into a new column in the table. Note that you - are strongly advised to use structured references in your formulae wherever - possible to make it simple to reference columns by name. - - * If providing a string formula (such as "=[@colx]*[@coly]") the column will - be added to the end of the table (eg: to the far right), after any default - sparklines and before any row_totals. - * For the most control supply an options dictionary with the following keys: - "formula" (mandatory), one of "insert_before" or "insert_after", and - optionally "return_dtype". The latter is used to appropriately format the - output of the formula and allow it to participate in row/column totals. 
- float_precision : int - Default number of decimals displayed for floating point columns (note that - this is purely a formatting directive; the actual values are not rounded). - include_header : bool - Indicate if the table should be created with a header row. - autofilter : bool - If the table has headers, provide autofilter capability. - autofit : bool - Calculate individual column widths from the data. - hidden_columns : list - A list or selector representing table columns to hide in the worksheet. - hide_gridlines : bool - Do not display any gridlines on the output worksheet. - sheet_zoom : int - Set the default zoom level of the output worksheet. - freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) - Freeze workbook panes. - - * If (row, col) is supplied, panes are split at the top-left corner of the - specified cell, which are 0-indexed. Thus, to freeze only the top row, - supply (1, 0). - * Alternatively, cell notation can be used to supply the cell. For example, - "A2" indicates the split occurs at the top-left of cell A2, which is the - equivalent of (1, 0). - * If (row, col, top_row, top_col) are supplied, the panes are split based on - the `row` and `col`, and the scrolling region is inititalized to begin at - the `top_row` and `top_col`. Thus, to freeze only the top row and have the - scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). - Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. - - Notes - ----- - * A list of compatible `xlsxwriter` format property names can be found here: - https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties - - * Conditional formatting dictionaries should provide xlsxwriter-compatible - definitions; polars will take care of how they are applied on the worksheet - with respect to the relative sheet/column position. For supported options, - see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html - - * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible - key/values, as well as a mandatory polars "columns" key that defines the - sparkline source data; these source columns should all be adjacent. Two other - polars-specific keys are available to help define where the sparkline appears - in the table: "insert_after", and "insert_before". The value associated with - these keys should be the name of a column in the exported table. - https://xlsxwriter.readthedocs.io/working_with_sparklines.html - - * Formula dictionaries *must* contain a key called "formula", and then optional - "insert_after", "insert_before", and/or "return_dtype" keys. These additional - keys allow the column to be injected into the table at a specific location, - and/or to define the return type of the formula (eg: "Int64", "Float64", etc). - Formulas that refer to table columns should use Excel\'s structured references - syntax to ensure the formula is applied correctly and is table-relative. - https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e - - Examples - -------- - Instantiate a basic DataFrame: - - >>> from random import uniform - >>> from datetime import date - >>> - >>> df = pl.DataFrame( - ... { - ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], - ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], - ... "val": [10_000, 20_000, 30_000], - ... } - ... 
) - - Export to "dataframe.xlsx" (the default workbook name, if not specified) in the - working directory, add column totals ("sum" by default) on all numeric columns, - then autofit: - - >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP - - Write frame to a specific location on the sheet, set a named table style, - apply US-style date formatting, increase default float precision, apply a - non-default total function to a single column, autofit: - - >>> df.write_excel( # doctest: +SKIP - ... position="B4", - ... table_style="Table Style Light 16", - ... dtype_formats={pl.Date: "mm/dd/yyyy"}, - ... column_totals={"num": "average"}, - ... float_precision=6, - ... autofit=True, - ... ) - - Write the same frame to a named worksheet twice, applying different styles - and conditional formatting to each table, adding table titles using explicit - xlsxwriter integration: - - >>> from xlsxwriter import Workbook - >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP - ... # basic/default conditional formatting - ... df.write_excel( - ... workbook=wb, - ... worksheet="data", - ... position=(3, 1), # specify position as (row,col) coordinates - ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, - ... table_style="Table Style Medium 4", - ... ) - ... - ... # advanced conditional formatting, custom styles - ... df.write_excel( - ... workbook=wb, - ... worksheet="data", - ... position=(len(df) + 7, 1), - ... table_style={ - ... "style": "Table Style Light 4", - ... "first_column": True, - ... }, - ... conditional_formats={ - ... "num": { - ... "type": "3_color_scale", - ... "min_color": "#76933c", - ... "mid_color": "#c4d79b", - ... "max_color": "#ebf1de", - ... }, - ... "val": { - ... "type": "data_bar", - ... "data_bar_2010": True, - ... "bar_color": "#9bbb59", - ... "bar_negative_color_same": True, - ... "bar_negative_border_color_same": True, - ... }, - ... }, - ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, - ... column_widths={"val": 125}, - ... autofit=True, - ... ) - ... - ... # add some table titles (with a custom format) - ... ws = wb.get_worksheet_by_name("data") - ... fmt_title = wb.add_format( - ... { - ... "font_color": "#4f6228", - ... "font_size": 12, - ... "italic": True, - ... "bold": True, - ... } - ... ) - ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) - ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) - ... - - Export a table containing two different types of sparklines. Use default - options for the "trend" sparkline and customised options (and positioning) - for the "+/-" win_loss sparkline, with non-default integer dtype formatting, - column totals, a subtle two-tone heatmap and hidden worksheet gridlines: - - >>> df = pl.DataFrame( - ... { - ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], - ... "q1": [100, 55, -20, 0, 35], - ... "q2": [30, -10, 15, 60, 20], - ... "q3": [-50, 0, 40, 80, 80], - ... "q4": [75, 55, 25, -10, -55], - ... } - ... ) - >>> df.write_excel( # doctest: +SKIP - ... table_style="Table Style Light 2", - ... # apply accounting format to all flavours of integer - ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, - ... sparklines={ - ... # default options; just provide source cols - ... "trend": ["q1", "q2", "q3", "q4"], - ... # customised sparkline type, with positioning directive - ... "+/-": { - ... "columns": ["q1", "q2", "q3", "q4"], - ... "insert_after": "id", - ... "type": "win_loss", - ... }, - ... }, - ... conditional_formats={ - ... 
# create a unified multi-column heatmap - ... ("q1", "q2", "q3", "q4"): { - ... "type": "2_color_scale", - ... "min_color": "#95b3d7", - ... "max_color": "#ffffff", - ... }, - ... }, - ... column_totals=["q1", "q2", "q3", "q4"], - ... row_totals=True, - ... hide_gridlines=True, - ... ) - - Export a table containing an Excel formula-based column that calculates a - standardised Z-score, showing use of structured references in conjunction - with positioning directives, column totals, and custom formatting. - - >>> df = pl.DataFrame( - ... { - ... "id": ["a123", "b345", "c567", "d789", "e101"], - ... "points": [99, 45, 50, 85, 35], - ... } - ... ) - >>> df.write_excel( # doctest: +SKIP - ... table_style={ - ... "style": "Table Style Medium 15", - ... "first_column": True, - ... }, - ... column_formats={ - ... "id": {"font": "Consolas"}, - ... "points": {"align": "center"}, - ... "z-score": {"align": "center"}, - ... }, - ... column_totals="average", - ... formulas={ - ... "z-score": { - ... # use structured references to refer to the table columns and \'totals\' row - ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", - ... "insert_after": "points", - ... "return_dtype": pl.Float64, - ... } - ... }, - ... hide_gridlines=True, - ... sheet_zoom=125, - ... ) - - ''' - def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: - ''' - Write to Arrow IPC binary stream or Feather file. - - See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - file - Path or writeable file-like object to which the IPC data will be - written. If set to `None`, the output is returned as a BytesIO object. - compression : {\'uncompressed\', \'lz4\', \'zstd\'} - Compression method. Defaults to "uncompressed". - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.arrow" - >>> df.write_ipc(path) - - ''' - def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: - ''' - Write to Arrow IPC record batch stream. - - See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - file - Path or writeable file-like object to which the IPC record batch data will - be written. If set to `None`, the output is returned as a BytesIO object. - compression : {\'uncompressed\', \'lz4\', \'zstd\'} - Compression method. Defaults to "uncompressed". - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.arrow" - >>> df.write_ipc_stream(path) - - ''' - def write_parquet(self, file: str | Path | BytesIO) -> None: - ''' - Write to Apache Parquet file. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. - Choose "snappy" for more backwards compatibility guarantees - when you deal with older parquet readers. 
- compression_level - The level of compression to use. Higher compression means smaller files on - disk. - - - "gzip" : min-level: 0, max-level: 10. - - "brotli" : min-level: 0, max-level: 11. - - "zstd" : min-level: 1, max-level: 22. - - statistics - Write statistics to the parquet headers. This requires extra compute. - row_group_size - Size of the row groups in number of rows. Defaults to 512^2 rows. - use_pyarrow - Use C++ parquet implementation vs Rust parquet implementation. - At the moment C++ supports more features. - pyarrow_options - Arguments passed to `pyarrow.parquet.write_table`. - - If you pass `partition_cols` here, the dataset will be written - using `pyarrow.parquet.write_to_dataset`. - The `partition_cols` parameter leads to write the dataset to a directory. - Similar to Spark\'s partitioned datasets. - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.parquet" - >>> df.write_parquet(path) - - We can use pyarrow with use_pyarrow_write_to_dataset=True - to write partitioned datasets. The following example will - write the first row to ../watermark=1/*.parquet and the - other rows to ../watermark=2/*.parquet. - - >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) - >>> path: pathlib.Path = dirpath / "partitioned_object" - >>> df.write_parquet( - ... path, - ... use_pyarrow=True, - ... pyarrow_options={"partition_cols": ["watermark"]}, - ... ) - - ''' - def write_database(self, table_name: str, connection: str) -> None: - ''' - Write a polars frame to a database. - - Parameters - ---------- - table_name - Name of the table to create or append to in the target SQL database. - If your table name contains special characters, it should be quoted. - connection - Connection URI string, for example: - - * "postgresql://user:pass@server:port/database" - * "sqlite:////path/to/database.db" - if_exists : {\'append\', \'replace\', \'fail\'} - The insert mode. - \'replace\' will create a new database table, overwriting an existing one. - \'append\' will append to an existing table. - \'fail\' will fail if table already exists. - engine : {\'sqlalchemy\', \'adbc\'} - Select the engine used for writing the data. - ''' - def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: - ''' - Write DataFrame as delta table. - - Parameters - ---------- - target - URI of a table or a DeltaTable object. - mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} - How to handle existing data. - - * If \'error\', throw an error if the table already exists (default). - * If \'append\', will add new data. - * If \'overwrite\', will replace table with new data. - * If \'ignore\', will not write anything if table already exists. - overwrite_schema - If True, allows updating the schema of the table. - storage_options - Extra options for the storage backends supported by `deltalake`. - For cloud storages, this may include configurations for authentication etc. - - * See a list of supported storage options for S3 `here `__. - * See a list of supported storage options for GCS `here `__. - * See a list of supported storage options for Azure `here `__. - delta_write_options - Additional keyword arguments while writing a Delta lake Table. - See a list of supported write options `here `__. - - Raises - ------ - TypeError - If the DataFrame contains unsupported data types. 
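[Editor's note, outside the stub text: unlike most methods in this file, the `write_database` docstring above has no Examples section. A minimal, hedged sketch of a call, using a hypothetical table name and SQLite URI; the `if_exists` and `engine` values are the ones listed in its parameter description, and the call is not verified against this exact polars version.]

    >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})
    >>> df.write_database(
    ...     table_name="my_table",  # hypothetical table name
    ...     connection="sqlite:///my_database.db",  # hypothetical connection URI
    ...     if_exists="replace",
    ...     engine="sqlalchemy",
    ... )  # doctest: +SKIP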
- ArrowInvalidError - If the DataFrame contains data types that could not be cast to their - primitive type. - - Notes - ----- - The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` - are not supported by the delta protocol specification and will raise a - TypeError. - - Some other data types are not supported but have an associated `primitive type - `__ - to which they can be cast. This affects the following data types: - - - Unsigned integers - - :class:`Datetime` types with millisecond or nanosecond precision or with - time zone information - - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) - - Polars columns are always nullable. To write data to a delta table with - non-nullable columns, a custom pyarrow schema has to be passed to the - `delta_write_options`. See the last example below. - - Examples - -------- - Write a dataframe to the local filesystem as a Delta Lake table. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> table_path = "/path/to/delta-table/" - >>> df.write_delta(table_path) # doctest: +SKIP - - Append data to an existing Delta Lake table on the local filesystem. - Note that this will fail if the schema of the new data does not match the - schema of the existing table. - - >>> df.write_delta(table_path, mode="append") # doctest: +SKIP - - Overwrite a Delta Lake table as a new version. - If the schemas of the new and old data are the same, setting - `overwrite_schema` is not required. - - >>> existing_table_path = "/path/to/delta-table/" - >>> df.write_delta( - ... existing_table_path, mode="overwrite", overwrite_schema=True - ... ) # doctest: +SKIP - - Write a dataframe as a Delta Lake table to a cloud object store like S3. - - >>> table_path = "s3://bucket/prefix/to/delta-table/" - >>> df.write_delta( - ... table_path, - ... storage_options={ - ... "AWS_REGION": "THE_AWS_REGION", - ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", - ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", - ... }, - ... ) # doctest: +SKIP - - Write DataFrame as a Delta Lake table with non-nullable columns. - - >>> import pyarrow as pa - >>> existing_table_path = "/path/to/delta-table/" - >>> df.write_delta( - ... existing_table_path, - ... delta_write_options={ - ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) - ... }, - ... ) # doctest: +SKIP - - ''' - def estimated_size(self, unit: SizeUnit = ...) -> int | float: - ''' - Return an estimation of the total (heap) allocated size of the `DataFrame`. - - Estimated size is given in the specified unit (bytes by default). - - This estimation is the sum of the size of its buffers, validity, including - nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the - size of 2 arrays is not the sum of the sizes computed from this function. In - particular, [`StructArray`]\'s size is an upper bound. - - When an array is sliced, its allocated size remains constant because the buffer - unchanged. However, this function will yield a smaller number. This is because - this function returns the visible size of the buffer, not its total capacity. - - FFI buffers are included in this estimation. - - Parameters - ---------- - unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} - Scale the returned size to the given unit. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "x": list(reversed(range(1_000_000))), - ... 
"y": [v / 1000 for v in range(1_000_000)], - ... "z": [str(v) for v in range(1_000_000)], - ... }, - ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], - ... ) - >>> df.estimated_size() - 25888898 - >>> df.estimated_size("mb") - 24.689577102661133 - - ''' - def transpose(self) -> Self: - ''' - Transpose a DataFrame over the diagonal. - - Parameters - ---------- - include_header - If set, the column names will be added as first column. - header_name - If `include_header` is set, this determines the name of the column that will - be inserted. - column_names - Optional iterable yielding strings or a string naming an existing column. - These will name the value (non-header) columns in the transposed data. - - Notes - ----- - This is a very expensive operation. Perhaps you can do it differently. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) - >>> df.transpose(include_header=True) - shape: (2, 4) - ┌────────┬──────────┬──────────┬──────────┐ - │ column ┆ column_0 ┆ column_1 ┆ column_2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞════════╪══════════╪══════════╪══════════╡ - │ a ┆ 1 ┆ 2 ┆ 3 │ - │ b ┆ 1 ┆ 2 ┆ 3 │ - └────────┴──────────┴──────────┴──────────┘ - - Replace the auto-generated column names with a list - - >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 2 ┆ 3 │ - │ 1 ┆ 2 ┆ 3 │ - └─────┴─────┴─────┘ - - Include the header as a separate column - - >>> df.transpose( - ... include_header=True, header_name="foo", column_names=["a", "b", "c"] - ... ) - shape: (2, 4) - ┌─────┬─────┬─────┬─────┐ - │ foo ┆ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═════╡ - │ a ┆ 1 ┆ 2 ┆ 3 │ - │ b ┆ 1 ┆ 2 ┆ 3 │ - └─────┴─────┴─────┴─────┘ - - Replace the auto-generated column with column names from a generator function - - >>> def name_generator(): - ... base_name = "my_column_" - ... count = 0 - ... while True: - ... yield f"{base_name}{count}" - ... count += 1 - ... - >>> df.transpose(include_header=False, column_names=name_generator()) - shape: (2, 3) - ┌─────────────┬─────────────┬─────────────┐ - │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════════════╪═════════════╪═════════════╡ - │ 1 ┆ 2 ┆ 3 │ - │ 1 ┆ 2 ┆ 3 │ - └─────────────┴─────────────┴─────────────┘ - - Use an existing column as the new column names - - >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) - >>> df.transpose(column_names="id") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 3 ┆ 2 │ - │ 3 ┆ 4 ┆ 6 │ - └─────┴─────┴─────┘ - >>> df.transpose(include_header=True, header_name="new_id", column_names="id") - shape: (2, 4) - ┌────────┬─────┬─────┬─────┐ - │ new_id ┆ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╪═════╡ - │ col1 ┆ 1 ┆ 3 ┆ 2 │ - │ col2 ┆ 3 ┆ 4 ┆ 6 │ - └────────┴─────┴─────┴─────┘ - ''' - def reverse(self) -> DataFrame: - ''' - Reverse the DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "key": ["a", "b", "c"], - ... "val": [1, 2, 3], - ... } - ... 
) - >>> df.reverse() - shape: (3, 2) - ┌─────┬─────┐ - │ key ┆ val │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ c ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 1 │ - └─────┴─────┘ - - ''' - def rename(self, mapping: dict[str, str]) -> DataFrame: - ''' - Rename column names. - - Parameters - ---------- - mapping - Key value pairs that map from old name to new name. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} - ... ) - >>> df.rename({"foo": "apple"}) - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - - ''' - def insert_column(self, index: int, column: Series) -> Self: - ''' - Insert a Series at a certain column index. - - This operation is in place. - - Parameters - ---------- - index - Index at which to insert the new `Series` column. - column - `Series` to insert. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> s = pl.Series("baz", [97, 98, 99]) - >>> df.insert_column(1, s) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ baz ┆ bar │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 97 ┆ 4 │ - │ 2 ┆ 98 ┆ 5 │ - │ 3 ┆ 99 ┆ 6 │ - └─────┴─────┴─────┘ - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) - >>> df.insert_column(3, s) - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ d │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ - │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ - │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ - └─────┴──────┴───────┴──────┘ - - ''' - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: - ''' - Filter the rows in the DataFrame based on a predicate expression. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - predicates - Expression that evaluates to a boolean Series. - constraints - Column filters. Use name=value to filter column name by the supplied value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - - Filter on one condition: - - >>> df.filter(pl.col("foo") > 1) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Filter on multiple conditions, combined with and/or operators: - - >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Provide multiple filters using `*args` syntax: - - >>> df.filter( - ... pl.col("foo") <= 2, - ... ~pl.col("ham").is_in(["b", "c"]), - ... 
) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `**kwargs` syntax: - - >>> df.filter(foo=2, ham="b") - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def glimpse(self) -> str | None: - ''' - Return a dense preview of the DataFrame. - - The formatting shows one line per column so that wide dataframes display - cleanly. Each line shows the column name, the data type, and the first - few values. - - Parameters - ---------- - max_items_per_column - Maximum number of items to show per column. - max_colname_length - Maximum length of the displayed column names; values that exceed this - value are truncated with a trailing ellipsis. - return_as_string - If True, return the preview as a string instead of printing to stdout. - - See Also - -------- - describe, head, tail - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, 2.8, 3.0], - ... "b": [4, 5, None], - ... "c": [True, False, True], - ... "d": [None, "b", "c"], - ... "e": ["usd", "eur", None], - ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], - ... } - ... ) - >>> df.glimpse() - Rows: 3 - Columns: 6 - $ a 1.0, 2.8, 3.0 - $ b 4, 5, None - $ c True, False, True - $ d None, \'b\', \'c\' - $ e \'usd\', \'eur\', None - $ f 2020-01-01, 2021-01-02, 2022-01-01 - - ''' - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: - ''' - Summary statistics for a DataFrame. - - Parameters - ---------- - percentiles - One or more percentiles to include in the summary statistics. - All values must be in the range `[0, 1]`. - - Notes - ----- - The median is included by default as the 50% percentile. - - See Also - -------- - glimpse - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, 2.8, 3.0], - ... "b": [4, 5, None], - ... "c": [True, False, True], - ... "d": [None, "b", "c"], - ... "e": ["usd", "eur", None], - ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], - ... } - ... ) - >>> df.describe() - shape: (9, 7) - ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ - │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ - ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ - │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ - │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ - │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ - │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ - │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ - │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ - │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ - │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ - │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ - └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ - - ''' - def get_column_index(self, name: str) -> int: - ''' - Find the index of a column by name. - - Parameters - ---------- - name - Name of the column to find. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} - ... 
) - >>> df.get_column_index("ham") - 2 - - ''' - def replace_column(self, index: int, column: Series) -> Self: - ''' - Replace a column at an index location. - - This operation is in place. - - Parameters - ---------- - index - Column index. - column - Series that will replace the column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> s = pl.Series("apple", [10, 20, 30]) - >>> df.replace_column(0, s) - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 10 ┆ 6 ┆ a │ - │ 20 ┆ 7 ┆ b │ - │ 30 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - ''' - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: - ''' - Sort the dataframe by the given columns. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. - *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [6.0, 5.0, 4.0], - ... "c": ["a", "c", "b"], - ... } - ... ) - >>> df.sort("a") - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - Sorting by expressions is also supported. - - >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - └──────┴─────┴─────┘ - - Sort by multiple columns by passing a list of columns. - - >>> df.sort(["c", "a"], descending=True) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - └──────┴─────┴─────┘ - - Or use positional arguments to sort by multiple columns in the same way. - - >>> df.sort("c", "a", descending=[False, True]) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - ''' - def top_k(self, k: int) -> DataFrame: - ''' - Return the `k` largest elements. - - If \'descending=True` the smallest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - bottom_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 largest values in column b. 
- - >>> df.top_k(4, by="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ a ┆ 2 │ - │ b ┆ 2 │ - │ b ┆ 1 │ - └─────┴─────┘ - - Get the rows which contain the 4 largest values when sorting on column b and a. - - >>> df.top_k(4, by=["b", "a"]) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 2 │ - │ c ┆ 1 │ - └─────┴─────┘ - - ''' - def bottom_k(self, k: int) -> DataFrame: - ''' - Return the `k` smallest elements. - - If \'descending=True` the largest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - top_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 smallest values in column b. - - >>> df.bottom_k(4, by="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 1 │ - │ a ┆ 1 │ - │ c ┆ 1 │ - │ a ┆ 2 │ - └─────┴─────┘ - - Get the rows which contain the 4 smallest values when sorting on column a and b. - - >>> df.bottom_k(4, by=["a", "b"]) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ b ┆ 1 │ - │ b ┆ 2 │ - └─────┴─────┘ - - ''' - def equals(self, other: DataFrame) -> bool: - ''' - Check whether the DataFrame is equal to another DataFrame. - - Parameters - ---------- - other - DataFrame to compare with. - null_equal - Consider null values as equal. - - See Also - -------- - assert_frame_equal - - Examples - -------- - >>> df1 = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df2 = pl.DataFrame( - ... { - ... "foo": [3, 2, 1], - ... "bar": [8.0, 7.0, 6.0], - ... "ham": ["c", "b", "a"], - ... } - ... ) - >>> df1.equals(df1) - True - >>> df1.equals(df2) - False - - ''' - def replace(self, column: str, new_column: Series) -> Self: - ''' - Replace a column by a new Series. - - Parameters - ---------- - column - Column to replace. - new_column - New column to insert. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> s = pl.Series([10, 20, 30]) - >>> df.replace("foo", s) # works in-place! # doctest: +SKIP - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 10 ┆ 4 │ - │ 20 ┆ 5 │ - │ 30 ┆ 6 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int, length: int | None = ...) -> Self: - ''' - Get a slice of this DataFrame. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.slice(1, 2) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7.0 ┆ b │ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def head(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the last `abs(n)`. - - See Also - -------- - tail, glimpse, slice - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> df.head(3) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Pass a negative value to get all rows `except` the last `abs(n)`. - - >>> df.head(-3) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def tail(self, n: int = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the first `abs(n)`. - - See Also - -------- - head, slice - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> df.tail(3) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - │ 4 ┆ 9 ┆ d │ - │ 5 ┆ 10 ┆ e │ - └─────┴─────┴─────┘ - - Pass a negative value to get all rows `except` the first `abs(n)`. - - >>> df.tail(-3) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 4 ┆ 9 ┆ d │ - │ 5 ┆ 10 ┆ e │ - └─────┴─────┴─────┘ - - ''' - def limit(self, n: int = ...) -> Self: - """ - Get the first `n` rows. - - Alias for :func:`DataFrame.head`. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the last `abs(n)`. - - See Also - -------- - head - - """ - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: - ''' - Drop all rows that contain null values. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - subset - Column name(s) for which null values are considered. - If set to `None` (default), use all columns. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, None, 8], - ... "ham": ["a", "b", None], - ... } - ... ) - - The default behavior of this method is to drop rows where any single - value of the row is null. - - >>> df.drop_nulls() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - This behaviour can be constrained to consider only a subset of columns, as - defined by name or with a selector. 
For example, dropping rows if there is - a null in any of the integer columns: - - >>> import polars.selectors as cs - >>> df.drop_nulls(subset=cs.integer()) - shape: (2, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ null │ - └─────┴─────┴──────┘ - - Below are some additional examples that show how to drop null - values based on other conditions. - - >>> df = pl.DataFrame( - ... { - ... "a": [None, None, None, None], - ... "b": [1, 2, None, 1], - ... "c": [1, None, None, 1], - ... } - ... ) - >>> df - shape: (4, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪══════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ null ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴──────┴──────┘ - - Drop a row only if all values are null: - - >>> df.filter(~pl.all_horizontal(pl.all().is_null())) - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴─────┴──────┘ - - Drop a column if all values are null: - - >>> df[[s.name for s in df if not (s.null_count() == df.height)]] - shape: (4, 2) - ┌──────┬──────┐ - │ b ┆ c │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 1 ┆ 1 │ - │ 2 ┆ null │ - │ null ┆ null │ - │ 1 ┆ 1 │ - └──────┴──────┘ - - ''' - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the frame as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Notes - ----- - It is recommended to use LazyFrame when piping operations, in order - to fully take advantage of query optimization and parallelization. - See :meth:`df.lazy() `. - - Examples - -------- - >>> def cast_str_to_int(data, col_name): - ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) - ... - >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) - >>> df.pipe(cast_str_to_int, col_name="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 30 │ - │ 4 ┆ 40 │ - └─────┴─────┘ - - >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) - >>> df - shape: (2, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - └─────┴─────┘ - >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 1 │ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: - ''' - Add a column at index 0 that counts the rows. - - Parameters - ---------- - name - Name of the column to add. - offset - Start the row count at this offset. Default = 0 - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... 
) - >>> df.with_row_count() - shape: (3, 3) - ┌────────┬─────┬─────┐ - │ row_nr ┆ a ┆ b │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╡ - │ 0 ┆ 1 ┆ 2 │ - │ 1 ┆ 3 ┆ 4 │ - │ 2 ┆ 5 ┆ 6 │ - └────────┴─────┴─────┘ - - ''' - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: - ''' - Start a group by operation. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - .. note:: - Within each group, the order of rows is always preserved, regardless - of this argument. - - Returns - ------- - GroupBy - Object which can be used to perform aggregations. - - Examples - -------- - Group by one column and call `agg` to compute the grouped sum of another - column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... ) - >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 2 │ - │ b ┆ 5 │ - │ c ┆ 3 │ - └─────┴─────┘ - - Set `maintain_order=True` to ensure the order of the groups is consistent with - the input. - - >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) - shape: (3, 2) - ┌─────┬───────────┐ - │ a ┆ c │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [5, 3] │ - │ b ┆ [4, 2] │ - │ c ┆ [1] │ - └─────┴───────────┘ - - Group by multiple columns by passing a list of column names. - - >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - Or use positional arguments to group by multiple columns in the same way. - Expressions are also accepted. - - >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ f64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 0 ┆ 4.0 │ - │ b ┆ 1 ┆ 3.0 │ - │ c ┆ 1 ┆ 1.0 │ - └─────┴─────┴─────┘ - - The `GroupBy` object returned by this method is iterable, returning the name - and data of each group. - - >>> for name, data in df.group_by("a"): # doctest: +SKIP - ... print(name) - ... print(data) - ... - a - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘ - b - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘ - c - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def rolling(self, index_column: IntoExpr) -> RollingGroupBy: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - Different from a `group_by_dynamic` the windows are now determined by the - individual values and are not of constant intervals. 
For constant intervals use - :func:`DataFrame.group_by_dynamic`. - - If you have a time series ``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... - * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... - * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - **"1i" # length 1** - - **"10i" # length 10** - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling operation on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - RollingGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - group_by_dynamic - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> out = df.rolling(index_column="dt", period="2d").agg( - ... [ - ... pl.sum("a").alias("sum_a"), - ... pl.min("a").alias("min_a"), - ... pl.max("a").alias("max_a"), - ... ] - ... 
) - >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] - >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] - >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] - >>> out - shape: (6, 4) - ┌─────────────────────┬───────┬───────┬───────┐ - │ dt ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴───────┴───────┴───────┘ - - ''' - def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - Time windows are calculated and rows are assigned to windows. Different from a - normal group by is that a row can be member of multiple groups. - By default, the windows look like: - - - [start, start + period) - - [start + every, start + every + period) - - [start + 2*every, start + 2*every + period) - - ... - - where `start` is determined by `start_by`, `offset`, and `every` (see parameter - descriptions below). - - .. warning:: - The index column must be sorted in ascending order. If `by` is passed, then - the index column must be sorted in ascending order within each group. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - - .. deprecated:: 0.19.4 - Use `label` instead. - include_boundaries - Add the lower and upper bound of the window to the "_lower_boundary" and - "_upper_boundary" columns. This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - label : {\'left\', \'right\', \'datapoint\'} - Define which label to use for the window: - - - \'left\': lower boundary of the window - - \'right\': upper boundary of the window - - \'datapoint\': the first value of the index column in the given window. - If you don\'t need the label to be at one of the boundaries, choose this - option for maximum performance - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. 
- * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - DynamicGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - rolling - - Notes - ----- - 1) If you\'re coming from pandas, then - - .. code-block:: python - - # polars - df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) - - is equivalent to - - .. code-block:: python - - # pandas - df.set_index("ts").resample("D")["value"].sum().reset_index() - - though note that, unlike pandas, polars doesn\'t add extra rows for empty - windows. If you need `index_column` to be evenly spaced, then please combine - with :func:`DataFrame.upsample`. - - 2) The `every`, `period` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a group_by_dynamic on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Examples - -------- - >>> from datetime import datetime - >>> df = pl.DataFrame( - ... { - ... "time": pl.datetime_range( - ... start=datetime(2021, 12, 16), - ... end=datetime(2021, 12, 16, 3), - ... interval="30m", - ... eager=True, - ... ), - ... "n": range(7), - ... } - ... ) - >>> df - shape: (7, 2) - ┌─────────────────────┬─────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i64 │ - ╞═════════════════════╪═════╡ - │ 2021-12-16 00:00:00 ┆ 0 │ - │ 2021-12-16 00:30:00 ┆ 1 │ - │ 2021-12-16 01:00:00 ┆ 2 │ - │ 2021-12-16 01:30:00 ┆ 3 │ - │ 2021-12-16 02:00:00 ┆ 4 │ - │ 2021-12-16 02:30:00 ┆ 5 │ - │ 2021-12-16 03:00:00 ┆ 6 │ - └─────────────────────┴─────┘ - - Group by windows of 1 hour starting at 2021-12-16 00:00:00. - - >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [1, 2] │ - │ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ 2021-12-16 02:00:00 ┆ [5, 6] │ - └─────────────────────┴───────────┘ - - The window boundaries can also be added to the aggregation result - - >>> df.group_by_dynamic( - ... "time", every="1h", include_boundaries=True, closed="right" - ... 
).agg(pl.col("n").mean()) - shape: (4, 4) - ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ - │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ - ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ - │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ - │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ - │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ - │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ - └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ - - When closed="left", the window excludes the right end of interval: - [lower_bound, upper_bound) - - >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-16 00:00:00 ┆ [0, 1] │ - │ 2021-12-16 01:00:00 ┆ [2, 3] │ - │ 2021-12-16 02:00:00 ┆ [4, 5] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - When closed="both" the time values at the window boundaries belong to 2 groups. - - >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) - shape: (5, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ - │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - Dynamic group bys can also be combined with grouping on normal keys - - >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) - >>> df - shape: (7, 3) - ┌─────────────────────┬─────┬────────┐ - │ time ┆ n ┆ groups │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ str │ - ╞═════════════════════╪═════╪════════╡ - │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ - │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ - │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ - │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ - │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ - │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ - │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ - └─────────────────────┴─────┴────────┘ - >>> df.group_by_dynamic( - ... "time", - ... every="1h", - ... closed="both", - ... by="groups", - ... include_boundaries=True, - ... 
).agg(pl.col("n")) - shape: (7, 5) - ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ - │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ - ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ - │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ - │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ - │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ - │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ - │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ - └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ - - Dynamic group by on an index column - - >>> df = pl.DataFrame( - ... { - ... "idx": pl.int_range(0, 6, eager=True), - ... "A": ["A", "A", "B", "B", "B", "C"], - ... } - ... ) - >>> ( - ... df.group_by_dynamic( - ... "idx", - ... every="2i", - ... period="3i", - ... include_boundaries=True, - ... closed="right", - ... ).agg(pl.col("A").alias("A_agg_list")) - ... ) - shape: (4, 4) - ┌─────────────────┬─────────────────┬─────┬─────────────────┐ - │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 ┆ list[str] │ - ╞═════════════════╪═════════════════╪═════╪═════════════════╡ - │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ - │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ - │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ - │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ - └─────────────────┴─────────────────┴─────┴─────────────────┘ - - ''' - def upsample(self, time_column: str) -> Self: - ''' - Upsample a DataFrame at a regular frequency. - - The `every` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - - - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - Parameters - ---------- - time_column - time column will be used to determine a date_range. - Note that this column has to be sorted for the output to make sense. - every - interval will start \'every\' duration - offset - change the start of the date_range by this offset. - by - First group by these columns and then upsample for every group - maintain_order - Keep the ordering predictable. This is slower. - - Returns - ------- - DataFrame - Result will be sorted by `time_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - Examples - -------- - Upsample a DataFrame by a certain interval. - - >>> from datetime import datetime - >>> df = pl.DataFrame( - ... { - ... "time": [ - ... datetime(2021, 2, 1), - ... datetime(2021, 4, 1), - ... datetime(2021, 5, 1), - ... datetime(2021, 6, 1), - ... ], - ... "groups": ["A", "B", "A", "B"], - ... 
"values": [0, 1, 2, 3], - ... } - ... ).set_sorted("time") - >>> df.upsample( - ... time_column="time", every="1mo", by="groups", maintain_order=True - ... ).select(pl.all().forward_fill()) - shape: (7, 3) - ┌─────────────────────┬────────┬────────┐ - │ time ┆ groups ┆ values │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ str ┆ i64 │ - ╞═════════════════════╪════════╪════════╡ - │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ - │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ - │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ - │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ - └─────────────────────┴────────┴────────┘ - - ''' - def join_asof(self, other: DataFrame) -> DataFrame: - ''' - Perform an asof join. - - This is similar to a left-join except that we match on nearest key rather than - equal keys. - - Both DataFrames must be sorted by the asof_join key. - - For each row in the left DataFrame: - - - A "backward" search selects the last row in the right DataFrame whose - \'on\' key is less than or equal to the left\'s key. - - - A "forward" search selects the first row in the right DataFrame whose - \'on\' key is greater than or equal to the left\'s key. - - - A "nearest" search selects the last row in the right DataFrame whose value - is nearest to the left\'s key. String keys are not currently supported for a - nearest search. - - The default is "backward". - - Parameters - ---------- - other - Lazy DataFrame to join with. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - by - join on these columns before doing asof join - by_left - join on these columns before doing asof join - by_right - join on these columns before doing asof join - strategy : {\'backward\', \'forward\', \'nearest\'} - Join strategy. - suffix - Suffix to append to columns with a duplicate name. - tolerance - Numeric tolerance. By setting this the join will only be done if the near - keys are within this distance. If an asof join is done on columns of dtype - "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta - object or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. - - Examples - -------- - >>> from datetime import datetime - >>> gdp = pl.DataFrame( - ... { - ... "date": [ - ... datetime(2016, 1, 1), - ... datetime(2017, 1, 1), - ... datetime(2018, 1, 1), - ... datetime(2019, 1, 1), - ... ], # note record date: Jan 1st (sorted!) - ... "gdp": [4164, 4411, 4566, 4696], - ... } - ... ).set_sorted("date") - >>> population = pl.DataFrame( - ... { - ... "date": [ - ... datetime(2016, 5, 12), - ... datetime(2017, 5, 12), - ... 
datetime(2018, 5, 12), - ... datetime(2019, 5, 12), - ... ], # note record date: May 12th (sorted!) - ... "population": [82.19, 82.66, 83.12, 83.52], - ... } - ... ).set_sorted("date") - >>> population.join_asof(gdp, on="date", strategy="backward") - shape: (4, 3) - ┌─────────────────────┬────────────┬──────┐ - │ date ┆ population ┆ gdp │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ f64 ┆ i64 │ - ╞═════════════════════╪════════════╪══════╡ - │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ - │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ - │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ - │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ - └─────────────────────┴────────────┴──────┘ - - ''' - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: - ''' - Join in SQL-like fashion. - - Parameters - ---------- - other - DataFrame to join with. - on - Name(s) of the join columns in both DataFrames. - how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} - Join strategy. - - .. note:: - A left join preserves the row order of the left DataFrame. - left_on - Name(s) of the left join column(s). - right_on - Name(s) of the right join column(s). - suffix - Suffix to append to columns with a duplicate name. - validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} - Checks if join is of specified type. - - * *many_to_many* - “m:m”: default, does not result in checks - * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets - * *one_to_many* - “1:m”: check if join keys are unique in left dataset - * *many_to_one* - “m:1”: check if join keys are unique in right dataset - - .. note:: - - - This is currently not supported the streaming engine. - - This is only supported when joined by single columns. - - Returns - ------- - DataFrame - - See Also - -------- - join_asof - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> other_df = pl.DataFrame( - ... { - ... "apple": ["x", "y", "z"], - ... "ham": ["a", "b", "d"], - ... } - ... 
) - >>> df.join(other_df, on="ham") - shape: (2, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - └─────┴─────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="outer") - shape: (4, 4) - ┌──────┬──────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞══════╪══════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ null ┆ null ┆ d ┆ z │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └──────┴──────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="left") - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └─────┴─────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="semi") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 7.0 ┆ b │ - └─────┴─────┴─────┘ - - >>> df.join(other_df, on="ham", how="anti") - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - Notes - ----- - For joining on columns with categorical data, see `pl.StringCache()`. - - ''' - def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: - ''' - Apply a custom/user-defined function (UDF) over the rows of the DataFrame. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - The UDF will receive each row as a tuple of values: `udf(row)`. - - Implementing logic using a Python function is almost always *significantly* - slower and more memory intensive than implementing the same logic using - the native expression API because: - - - The native expression engine runs in Rust; UDFs run in Python. - - Use of Python UDFs forces the DataFrame to be materialized in memory. - - Polars-native expressions can be parallelised (UDFs typically cannot). - - Polars-native expressions can be logically optimised (UDFs cannot). - - Wherever possible you should strongly prefer the native expression API - to achieve the best performance. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output type of the operation. If none given, Polars tries to infer the type. - inference_size - Only used in the case when the custom function returns rows. - This uses the first `n` rows to determine the output schema. - - Notes - ----- - * The frame-level `apply` cannot track column names (as the UDF is a black-box - that may arbitrarily drop, rearrange, transform, or add new columns); if you - want to apply a UDF such that column names are preserved, you should use the - expression-level `apply` syntax instead. - - * If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. 
- - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) - - Return a DataFrame by mapping each row to a tuple: - - >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) - shape: (3, 2) - ┌──────────┬──────────┐ - │ column_0 ┆ column_1 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════════╪══════════╡ - │ 2 ┆ -3 │ - │ 4 ┆ 15 │ - │ 6 ┆ 24 │ - └──────────┴──────────┘ - - However, it is much better to implement this with a native expression: - - >>> df.select( - ... pl.col("foo") * 2, - ... pl.col("bar") * 3, - ... ) # doctest: +IGNORE_RESULT - - Return a DataFrame with a single column by mapping each row to a scalar: - - >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP - shape: (3, 1) - ┌───────┐ - │ apply │ - │ --- │ - │ i64 │ - ╞═══════╡ - │ 1 │ - │ 9 │ - │ 14 │ - └───────┘ - - In this case it is better to use the following native expression: - - >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT - - ''' - def hstack(self, columns: list[Series] | DataFrame) -> Self: - ''' - Return a new DataFrame grown horizontally by stacking multiple Series to it. - - Parameters - ---------- - columns - Series to stack. - in_place - Modify in place. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> x = pl.Series("apple", [10, 20, 30]) - >>> df.hstack([x]) - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6 ┆ a ┆ 10 │ - │ 2 ┆ 7 ┆ b ┆ 20 │ - │ 3 ┆ 8 ┆ c ┆ 30 │ - └─────┴─────┴─────┴───────┘ - - ''' - def vstack(self, other: DataFrame) -> Self: - ''' - Grow this DataFrame vertically by stacking a DataFrame to it. - - Parameters - ---------- - other - DataFrame to stack. - in_place - Modify in place. - - See Also - -------- - extend - - Examples - -------- - >>> df1 = pl.DataFrame( - ... { - ... "foo": [1, 2], - ... "bar": [6, 7], - ... "ham": ["a", "b"], - ... } - ... ) - >>> df2 = pl.DataFrame( - ... { - ... "foo": [3, 4], - ... "bar": [8, 9], - ... "ham": ["c", "d"], - ... } - ... ) - >>> df1.vstack(df2) - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - │ 4 ┆ 9 ┆ d │ - └─────┴─────┴─────┘ - - ''' - def extend(self, other: DataFrame) -> Self: - ''' - Extend the memory backed by this `DataFrame` with the values from `other`. - - Different from `vstack` which adds the chunks from `other` to the chunks of - this `DataFrame`, `extend` appends the data from `other` to the underlying - memory locations and thus may cause a reallocation. - - If this does not cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `vstack` when you want to do a query after a single - append. For instance, during online operations where you add `n` rows and rerun - a query. - - Prefer `vstack` over `extend` when you want to append many times before - doing a query. For instance, when you read in multiple files and want to store - them in a single `DataFrame`. In the latter case, finish the sequence of - `vstack` operations with a `rechunk`. - - Parameters - ---------- - other - DataFrame to vertically add. - - Warnings - -------- - This method modifies the dataframe in-place. The dataframe is returned for - convenience only. 
- - See Also - -------- - vstack - - Examples - -------- - >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) - >>> df1.extend(df2) - shape: (6, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 5 │ - │ 3 ┆ 6 │ - │ 10 ┆ 40 │ - │ 20 ┆ 50 │ - │ 30 ┆ 60 │ - └─────┴─────┘ - - ''' - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: - ''' - Remove columns from the dataframe. - - Parameters - ---------- - columns - Names of the columns that should be removed from the dataframe, or - a selector that determines the columns to drop. - *more_columns - Additional columns to drop, specified as positional arguments. - - Examples - -------- - Drop a single column by passing the name of that column. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.drop("ham") - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 6.0 │ - │ 2 ┆ 7.0 │ - │ 3 ┆ 8.0 │ - └─────┴─────┘ - - Drop multiple columns by passing a list of column names. - - >>> df.drop(["bar", "ham"]) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Drop multiple columns by passing a selector. - - >>> import polars.selectors as cs - >>> df.drop(cs.numeric()) - shape: (3, 1) - ┌─────┐ - │ ham │ - │ --- │ - │ str │ - ╞═════╡ - │ a │ - │ b │ - │ c │ - └─────┘ - - Use positional arguments to drop multiple columns. - - >>> df.drop("foo", "ham") - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 6.0 │ - │ 7.0 │ - │ 8.0 │ - └─────┘ - - ''' - def drop_in_place(self, name: str) -> Series: - ''' - Drop a single column in-place and return the dropped column. - - Parameters - ---------- - name - Name of the column to drop. - - Returns - ------- - Series - The dropped column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.drop_in_place("ham") - shape: (3,) - Series: \'ham\' [str] - [ - "a" - "b" - "c" - ] - - ''' - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: - ''' - Cast DataFrame column(s) to the specified dtype(s). - - Parameters - ---------- - dtypes - Mapping of column names (or selector) to dtypes, or a single dtype - to which all columns will be cast. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], - ... } - ... 
) - - Cast specific frame columns to the specified dtypes: - - >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ u8 ┆ date │ - ╞═════╪═════╪════════════╡ - │ 1.0 ┆ 6 ┆ 2020-01-02 │ - │ 2.0 ┆ 7 ┆ 2021-03-04 │ - │ 3.0 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - Cast all frame columns to the specified dtype: - - >>> df.cast(pl.Utf8).to_dict(as_series=False) - {\'foo\': [\'1\', \'2\', \'3\'], - \'bar\': [\'6.0\', \'7.0\', \'8.0\'], - \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} - - Use selectors to define the columns being cast: - - >>> import polars.selectors as cs - >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ str │ - ╞═════╪═════╪════════════╡ - │ 1 ┆ 6 ┆ 2020-01-02 │ - │ 2 ┆ 7 ┆ 2021-03-04 │ - │ 3 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - ''' - def clear(self, n: int = ...) -> Self: - ''' - Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. - - Returns a `n`-row null-filled DataFrame with an identical schema. - `n` can be greater than the current number of rows in the DataFrame. - - Parameters - ---------- - n - Number of (null-filled) rows to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> df.clear() - shape: (0, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪═════╪══════╡ - └─────┴─────┴──────┘ - - >>> df.clear(n=2) - shape: (2, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪══════╪══════╡ - │ null ┆ null ┆ null │ - │ null ┆ null ┆ null │ - └──────┴──────┴──────┘ - - ''' - def clone(self) -> Self: - ''' - Create a copy of this DataFrame. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current DataFrame, with identical - schema but no data. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.clone() - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true │ - │ 2 ┆ 4.0 ┆ true │ - │ 3 ┆ 10.0 ┆ false │ - │ 4 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - ''' - def get_columns(self) -> list[Series]: - ''' - Get the DataFrame as a List of Series. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.get_columns() - [shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 3 - ], shape: (3,) - Series: \'bar\' [i64] - [ - 4 - 5 - 6 - ]] - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.get_columns() - [shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - ], shape: (4,) - Series: \'b\' [f64] - [ - 0.5 - 4.0 - 10.0 - 13.0 - ], shape: (4,) - Series: \'c\' [bool] - [ - true - true - false - true - ]] - - ''' - def get_column(self, name: str) -> Series: - ''' - Get a single column by name. - - Parameters - ---------- - name : str - Name of the column to retrieve. 
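As a small illustrative aside (not part of the original docstring): `get_column` retrieves a column by name, while the related `to_series` method retrieves one by position.

>>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
>>> df.to_series(1).name
'bar'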
- - Returns - ------- - Series - - See Also - -------- - to_series - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.get_column("foo") - shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - matches_supertype - Fill all matching supertype of the fill `value`. - - Returns - ------- - DataFrame - DataFrame with None values replaced by the filling strategy. - - See Also - -------- - fill_nan - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 4], - ... "b": [0.5, 4, None, 13], - ... } - ... ) - >>> df.fill_null(99) - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 99 ┆ 99.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - >>> df.fill_null(strategy="forward") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> df.fill_null(strategy="max") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> df.fill_null(strategy="zero") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 0 ┆ 0.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - ''' - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: - ''' - Fill floating point NaN values by an Expression evaluation. - - Parameters - ---------- - value - Value with which to replace NaN values. - - Returns - ------- - DataFrame - DataFrame with NaN values replaced by the given value. - - Warnings - -------- - Note that floating point NaNs (Not a Number) are not missing values! - To replace missing values, use :func:`fill_null`. - - See Also - -------- - fill_null - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1.5, 2, float("nan"), 4], - ... "b": [0.5, 4, float("nan"), 13], - ... } - ... ) - >>> df.fill_nan(99) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════╡ - │ 1.5 ┆ 0.5 │ - │ 2.0 ┆ 4.0 │ - │ 99.0 ┆ 99.0 │ - │ 4.0 ┆ 13.0 │ - └──────┴──────┘ - - ''' - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: - ''' - Explode the dataframe to long format by exploding the given columns. - - Parameters - ---------- - columns - Column names, expressions, or a selector defining them. The underlying - columns being exploded must be of List or Utf8 datatype. - *more_columns - Additional names of columns to explode, specified as positional arguments. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "letters": ["a", "a", "b", "c"], - ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], - ... } - ... 
) - >>> df - shape: (4, 2) - ┌─────────┬───────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════════╪═══════════╡ - │ a ┆ [1] │ - │ a ┆ [2, 3] │ - │ b ┆ [4, 5] │ - │ c ┆ [6, 7, 8] │ - └─────────┴───────────┘ - >>> df.explode("numbers") - shape: (8, 2) - ┌─────────┬─────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════════╪═════════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ a ┆ 3 │ - │ b ┆ 4 │ - │ b ┆ 5 │ - │ c ┆ 6 │ - │ c ┆ 7 │ - │ c ┆ 8 │ - └─────────┴─────────┘ - - ''' - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: - ''' - Create a spreadsheet-style pivot table as a DataFrame. - - Only available in eager mode. See "Examples" section below for how to do a - "lazy pivot" if you know the unique column values in advance. - - Parameters - ---------- - values - Column values to aggregate. Can be multiple columns if the *columns* - arguments contains multiple columns as well. - index - One or multiple keys to group by. - columns - Name of the column(s) whose values will be used as the header of the output - DataFrame. - aggregate_function - Choose from: - - - None: no aggregation takes place, will raise error if multiple values are in group. - - A predefined aggregate function string, one of - {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} - - An expression to do the aggregation. - - maintain_order - Sort the grouped keys so that the output order is predictable. - sort_columns - Sort the transposed columns by name. Default is by order of discovery. - separator - Used as separator/delimiter in generated column names. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": ["one", "one", "two", "two", "one", "two"], - ... "bar": ["y", "y", "y", "x", "x", "x"], - ... "baz": [1, 2, 3, 4, 5, 6], - ... } - ... ) - >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ y ┆ x │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ one ┆ 3 ┆ 5 │ - │ two ┆ 3 ┆ 10 │ - └─────┴─────┴─────┘ - - Pivot using selectors to determine the index/values/columns: - - >>> import polars.selectors as cs - >>> df.pivot( - ... values=cs.numeric(), - ... index=cs.string(), - ... columns=cs.string(), - ... aggregate_function="sum", - ... sort_columns=True, - ... ).sort( - ... by=cs.string(), - ... ) - shape: (4, 6) - ┌─────┬─────┬──────┬──────┬──────┬──────┐ - │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪══════╪══════╪══════╪══════╡ - │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ - │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ - │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ - │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ - └─────┴─────┴──────┴──────┴──────┴──────┘ - - Run an expression as aggregation function - - >>> df = pl.DataFrame( - ... { - ... "col1": ["a", "a", "a", "b", "b", "b"], - ... "col2": ["x", "x", "x", "x", "y", "y"], - ... "col3": [6, 7, 3, 2, 5, 7], - ... } - ... ) - >>> df.pivot( - ... index="col1", - ... columns="col2", - ... values="col3", - ... aggregate_function=pl.element().tanh().mean(), - ... 
) - shape: (2, 3) - ┌──────┬──────────┬──────────┐ - │ col1 ┆ x ┆ y │ - │ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 │ - ╞══════╪══════════╪══════════╡ - │ a ┆ 0.998347 ┆ null │ - │ b ┆ 0.964028 ┆ 0.999954 │ - └──────┴──────────┴──────────┘ - - Note that `pivot` is only available in eager mode. If you know the unique - column values in advance, you can use :meth:`polars.LazyFrame.groupby` to - get the same result as above in lazy mode: - - >>> index = pl.col("col1") - >>> columns = pl.col("col2") - >>> values = pl.col("col3") - >>> unique_column_values = ["x", "y"] - >>> aggregate_function = lambda col: col.tanh().mean() - >>> ( - ... df.lazy() - ... .group_by(index) - ... .agg( - ... *[ - ... aggregate_function(values.filter(columns == value)).alias(value) - ... for value in unique_column_values - ... ] - ... ) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌──────┬──────────┬──────────┐ - │ col1 ┆ x ┆ y │ - │ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 │ - ╞══════╪══════════╪══════════╡ - │ a ┆ 0.998347 ┆ null │ - │ b ┆ 0.964028 ┆ 0.999954 │ - └──────┴──────────┴──────────┘ - - ''' - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: - ''' - Unpivot a DataFrame from wide to long format. - - Optionally leaves identifiers set. - - This function is useful to massage a DataFrame into a format where one or more - columns are identifier variables (id_vars) while all other columns, considered - measured variables (value_vars), are "unpivoted" to the row axis leaving just - two non-identifier columns, \'variable\' and \'value\'. - - Parameters - ---------- - id_vars - Column(s) or selector(s) to use as identifier variables. - value_vars - Column(s) or selector(s) to use as values variables; if `value_vars` - is empty all columns that are not in `id_vars` will be used. - variable_name - Name to give to the `variable` column. Defaults to "variable" - value_name - Name to give to the `value` column. Defaults to "value" - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> import polars.selectors as cs - >>> df.melt(id_vars="a", value_vars=cs.numeric()) - shape: (6, 3) - ┌─────┬──────────┬───────┐ - │ a ┆ variable ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 │ - ╞═════╪══════════╪═══════╡ - │ x ┆ b ┆ 1 │ - │ y ┆ b ┆ 3 │ - │ z ┆ b ┆ 5 │ - │ x ┆ c ┆ 2 │ - │ y ┆ c ┆ 4 │ - │ z ┆ c ┆ 6 │ - └─────┴──────────┴───────┘ - - ''' - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: - ''' - Unstack a long table to a wide form without doing an aggregation. - - This can be much faster than a pivot, because it can skip the grouping phase. - - Warnings - -------- - This functionality is experimental and may be subject to changes - without it being considered a breaking change. - - Parameters - ---------- - step - Number of rows in the unstacked frame. - how : { \'vertical\', \'horizontal\' } - Direction of the unstack. - columns - Column name(s) or selector(s) to include in the operation. - If set to `None` (default), use all columns. - fill_values - Fill values that don\'t fit the new size with this value. 
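Returning briefly to the `melt` parameters earlier in this hunk, a hedged sketch of `variable_name` and `value_name` with toy data (the chosen names are arbitrary; the stub's own `unstack` examples continue below).

>>> df = pl.DataFrame({"a": ["x", "y"], "b": [1, 3], "c": [2, 4]})
>>> df.melt(id_vars="a", variable_name="field", value_name="amount").columns
['a', 'field', 'amount']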
- - Examples - -------- - >>> from string import ascii_uppercase - >>> df = pl.DataFrame( - ... { - ... "x": list(ascii_uppercase[0:8]), - ... "y": pl.int_range(1, 9, eager=True), - ... } - ... ).with_columns( - ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), - ... ) - >>> df - shape: (8, 3) - ┌─────┬─────┬──────────┐ - │ x ┆ y ┆ z │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ list[u8] │ - ╞═════╪═════╪══════════╡ - │ A ┆ 1 ┆ [1, 2] │ - │ B ┆ 2 ┆ [2, 3] │ - │ C ┆ 3 ┆ [3, 4] │ - │ D ┆ 4 ┆ [4, 5] │ - │ E ┆ 5 ┆ [5, 6] │ - │ F ┆ 6 ┆ [6, 7] │ - │ G ┆ 7 ┆ [7, 8] │ - │ H ┆ 8 ┆ [8, 9] │ - └─────┴─────┴──────────┘ - >>> df.unstack(step=4, how="vertical") - shape: (4, 6) - ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ - │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ - ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ - │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ - │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ - │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ - │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ - └─────┴─────┴─────┴─────┴──────────┴──────────┘ - >>> df.unstack(step=2, how="horizontal") - shape: (4, 6) - ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ - │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ - ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ - │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ - │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ - │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ - │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ - └─────┴─────┴─────┴─────┴──────────┴──────────┘ - >>> import polars.selectors as cs - >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) - shape: (5, 2) - ┌─────┬─────┐ - │ y_0 ┆ y_1 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - │ 4 ┆ 0 │ - │ 5 ┆ 0 │ - └─────┴─────┘ - - ''' - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: - ''' - Group by the given columns and return the groups as separate dataframes. - - Parameters - ---------- - by - Column name(s) or selector(s) to group by. - *more_by - Additional names of columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default partition by operation. - include_key - Include the columns used to partition the DataFrame in the output. - as_dict - Return a dictionary instead of a list. The dictionary keys are the distinct - group values that identify that group. - - Examples - -------- - Pass a single column name to partition by that column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... 
) - >>> df.partition_by("a") # doctest: +IGNORE_RESULT - [shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘] - - Partition by multiple columns by either passing a list of column names, or by - specifying each column name as a positional argument. - - >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT - [shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘] - - Return the partitions as a dictionary by specifying `as_dict=True`. - - >>> import polars.selectors as cs - >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT - {\'a\': shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - \'b\': shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - \'c\': shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘} - - ''' - def shift(self, n: int = ...) -> DataFrame: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... ) - >>> df.shift() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ null ┆ null │ - │ 1 ┆ 5 │ - │ 2 ┆ 6 │ - │ 3 ┆ 7 │ - └──────┴──────┘ - - Pass a negative value to shift in the opposite direction instead. - - >>> df.shift(-2) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ null ┆ null │ - │ null ┆ null │ - └──────┴──────┘ - - Specify `fill_value` to fill the resulting null values. 
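The frame-level `shift` above moves whole rows; when a per-group lag is needed, the same idea can be expressed with the expression API, as in this illustrative sketch with toy data (not from the original docstring). The stub's own `fill_value` example continues below.

>>> df = pl.DataFrame({"g": ["a", "a", "b", "b"], "x": [1, 2, 3, 4]})
>>> out = df.with_columns(x_lag=pl.col("x").shift(1).over("g"))
>>> out["x_lag"].to_list()
[None, 1, None, 3]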
- - >>> df.shift(-2, fill_value=100) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ 100 ┆ 100 │ - │ 100 ┆ 100 │ - └─────┴─────┘ - - ''' - def is_duplicated(self) -> Series: - ''' - Get a mask of all duplicated rows in this DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) - >>> df.is_duplicated() - shape: (4,) - Series: \'\' [bool] - [ - true - false - false - true - ] - - This mask can be used to visualize the duplicated lines like this: - - >>> df.filter(df.is_duplicated()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ str │ - ╞═════╪═════╡ - │ 1 ┆ x │ - │ 1 ┆ x │ - └─────┴─────┘ - ''' - def is_unique(self) -> Series: - ''' - Get a mask of all unique rows in this DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) - >>> df.is_unique() - shape: (4,) - Series: \'\' [bool] - [ - false - true - true - false - ] - - This mask can be used to visualize the unique lines like this: - - >>> df.filter(df.is_unique()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ str │ - ╞═════╪═════╡ - │ 2 ┆ y │ - │ 3 ┆ z │ - └─────┴─────┘ - ''' - def lazy(self) -> LazyFrame: - ''' - Start a lazy query from this point. This returns a `LazyFrame` object. - - Operations on a `LazyFrame` are not executed until this is requested by either - calling: - - * :meth:`.fetch() ` - (run on a small number of rows) - * :meth:`.collect() ` - (run on all data) - * :meth:`.describe_plan() ` - (print unoptimized query plan) - * :meth:`.describe_optimized_plan() ` - (print optimized query plan) - * :meth:`.show_graph() ` - (show (un)optimized query plan as graphviz graph) - - Lazy operations are advised because they allow for query optimization and more - parallelization. - - Returns - ------- - LazyFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> df.lazy() # doctest: +ELLIPSIS - - - ''' - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - ''' - Select columns from this DataFrame. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Examples - -------- - Pass the name of a column to select that column. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.select("foo") - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Multiple columns can be selected by passing a list of column names. - - >>> df.select(["foo", "bar"]) - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - └─────┴─────┘ - - Multiple columns can also be selected using positional arguments instead of a - list. Expressions are also accepted. 
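Besides names, lists and expressions, `select` also accepts selectors and `pl.exclude`; a brief illustrative sketch with toy data (the stub's own positional-argument example follows below).

>>> import polars.selectors as cs
>>> df = pl.DataFrame({"foo": [1, 2], "bar": [6.0, 7.0], "ham": ["a", "b"]})
>>> df.select(cs.numeric()).columns
['foo', 'bar']
>>> df.select(pl.exclude("ham")).columns
['foo', 'bar']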
- - >>> df.select(pl.col("foo"), pl.col("bar") + 1) - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - └─────┴─────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) - shape: (3, 1) - ┌───────────┐ - │ threshold │ - │ --- │ - │ i32 │ - ╞═══════════╡ - │ 0 │ - │ 0 │ - │ 10 │ - └───────────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... df.select( - ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), - ... ) - ... - shape: (3, 1) - ┌───────────┐ - │ is_odd │ - │ --- │ - │ struct[2] │ - ╞═══════════╡ - │ {1,0} │ - │ {0,1} │ - │ {1,0} │ - └───────────┘ - - ''' - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - """ - Select columns from this LazyFrame. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - See Also - -------- - select - - """ - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - ''' - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - DataFrame - A new DataFrame with the columns added. - - Notes - ----- - Creating a new DataFrame using this method does not create a new copy of - existing data. - - Examples - -------- - Pass an expression to add it as a new column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ a^2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ - └─────┴──────┴───────┴──────┘ - - Added columns will replace existing columns with the same name. - - >>> df.with_columns(pl.col("a").cast(pl.Float64)) - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1.0 ┆ 0.5 ┆ true │ - │ 2.0 ┆ 4.0 ┆ true │ - │ 3.0 ┆ 10.0 ┆ false │ - │ 4.0 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - Multiple columns can be added by passing a list of expressions. - - >>> df.with_columns( - ... [ - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ] - ... 
) - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Multiple columns also can be added using positional arguments instead of a list. - - >>> df.with_columns( - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ) - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> df.with_columns( - ... ab=pl.col("a") * pl.col("b"), - ... not_c=pl.col("c").not_(), - ... ) - shape: (4, 5) - ┌─────┬──────┬───────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ ab ┆ not_c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ - └─────┴──────┴───────┴──────┴───────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... df.drop("c").with_columns( - ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), - ... ) - ... - shape: (4, 3) - ┌─────┬──────┬─────────────┐ - │ a ┆ b ┆ diffs │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ struct[2] │ - ╞═════╪══════╪═════════════╡ - │ 1 ┆ 0.5 ┆ {null,null} │ - │ 2 ┆ 4.0 ┆ {1,3.5} │ - │ 3 ┆ 10.0 ┆ {1,6.0} │ - │ 4 ┆ 13.0 ┆ {1,3.0} │ - └─────┴──────┴─────────────┘ - - ''' - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - """ - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - See Also - -------- - with_columns - - """ - def n_chunks(self, strategy: str = ...) -> int | list[int]: - ''' - Get number of chunks used by the ChunkedArrays of this DataFrame. - - Parameters - ---------- - strategy : {\'first\', \'all\'} - Return the number of chunks of the \'first\' column, - or \'all\' columns in this DataFrame. - - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... 
) - >>> df.n_chunks() - 1 - >>> df.n_chunks(strategy="all") - [1, 1, 1] - - ''' - def max(self, axis: int | None = ...) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their maximum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`max_horizontal`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.max() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def max_horizontal(self) -> Series: - ''' - Get the maximum value horizontally across columns. - - Returns - ------- - Series - A Series named `"max"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.max_horizontal() - shape: (3,) - Series: \'max\' [f64] - [ - 4.0 - 5.0 - 6.0 - ] - ''' - def min(self, axis: int | None = ...) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their minimum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`min_horizontal`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.min() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - ''' - def min_horizontal(self) -> Series: - ''' - Get the minimum value horizontally across columns. - - Returns - ------- - Series - A Series named `"min"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.min_horizontal() - shape: (3,) - Series: \'min\' [f64] - [ - 1.0 - 2.0 - 3.0 - ] - ''' - def sum(self) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their sum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`sum_horizontal`. - null_strategy : {\'ignore\', \'propagate\'} - This argument is only used if `axis == 1`. - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.sum() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 6 ┆ 21 ┆ null │ - └─────┴─────┴──────┘ - ''' - def sum_horizontal(self) -> Series: - ''' - Sum all values horizontally across columns. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - If set to `False`, any null value in the input will lead to a null output. 
- - Returns - ------- - Series - A Series named `"sum"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.sum_horizontal() - shape: (3,) - Series: \'sum\' [f64] - [ - 5.0 - 7.0 - 9.0 - ] - ''' - def mean(self) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their mean value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`mean_horizontal`. - null_strategy : {\'ignore\', \'propagate\'} - This argument is only used if `axis == 1`. - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... "spam": [True, False, None], - ... } - ... ) - >>> df.mean() - shape: (1, 4) - ┌─────┬─────┬──────┬──────┐ - │ foo ┆ bar ┆ ham ┆ spam │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str ┆ f64 │ - ╞═════╪═════╪══════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ - └─────┴─────┴──────┴──────┘ - ''' - def mean_horizontal(self) -> Series: - ''' - Take the mean of all values horizontally across columns. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - If set to `False`, any null value in the input will lead to a null output. - - Returns - ------- - Series - A Series named `"mean"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.mean_horizontal() - shape: (3,) - Series: \'mean\' [f64] - [ - 2.5 - 3.5 - 4.5 - ] - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their standard deviation value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.std() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1.0 ┆ 1.0 ┆ null │ - └─────┴─────┴──────┘ - >>> df.std(ddof=0) - shape: (1, 3) - ┌──────────┬──────────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞══════════╪══════════╪══════╡ - │ 0.816497 ┆ 0.816497 ┆ null │ - └──────────┴──────────┴──────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their variance value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.var() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1.0 ┆ 1.0 ┆ null │ - └─────┴─────┴──────┘ - >>> df.var(ddof=0) - shape: (1, 3) - ┌──────────┬──────────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞══════════╪══════════╪══════╡ - │ 0.666667 ┆ 0.666667 ┆ null │ - └──────────┴──────────┴──────┘ - - ''' - def median(self) -> Self: - ''' - Aggregate the columns of this DataFrame to their median value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.median() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null │ - └─────┴─────┴──────┘ - - ''' - def product(self) -> DataFrame: - ''' - Aggregate the columns of this DataFrame to their product values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [0.5, 4, 10], - ... "c": [True, True, False], - ... } - ... ) - - >>> df.product() - shape: (1, 3) - ┌─────┬──────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ i64 │ - ╞═════╪══════╪═════╡ - │ 6 ┆ 20.0 ┆ 0 │ - └─────┴──────┴─────┘ - - ''' - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.quantile(0.5, "nearest") - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null │ - └─────┴─────┴──────┘ - - ''' - def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Convert categorical variables into dummy/indicator variables. - - Parameters - ---------- - columns - Column name(s) or selector(s) that should be converted to dummy - variables. If set to `None` (default), convert all columns. - separator - Separator/delimiter used when generating column names. - drop_first - Remove the first category from the variables being encoded. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2], - ... "bar": [3, 4], - ... "ham": ["a", "b"], - ... } - ... 
) - >>> df.to_dummies() - shape: (2, 6) - ┌───────┬───────┬───────┬───────┬───────┬───────┐ - │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ - ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ - │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ - │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ - └───────┴───────┴───────┴───────┴───────┴───────┘ - - >>> df.to_dummies(drop_first=True) - shape: (2, 3) - ┌───────┬───────┬───────┐ - │ foo_2 ┆ bar_4 ┆ ham_b │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 │ - ╞═══════╪═══════╪═══════╡ - │ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1 ┆ 1 │ - └───────┴───────┴───────┘ - - >>> import polars.selectors as cs - >>> df.to_dummies(cs.integer(), separator=":") - shape: (2, 5) - ┌───────┬───────┬───────┬───────┬─────┐ - │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ - ╞═══════╪═══════╪═══════╪═══════╪═════╡ - │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ - │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ - └───────┴───────┴───────┴───────┴─────┘ - - >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") - shape: (2, 3) - ┌───────┬───────┬─────┐ - │ foo:2 ┆ bar:4 ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ str │ - ╞═══════╪═══════╪═════╡ - │ 0 ┆ 0 ┆ a │ - │ 1 ┆ 1 ┆ b │ - └───────┴───────┴─────┘ - - ''' - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: - ''' - Drop duplicate rows from this dataframe. - - Parameters - ---------- - subset - Column name(s) or selector(s), to consider when identifying - duplicate rows. If set to `None` (default), use all columns. - keep : {\'first\', \'last\', \'any\', \'none\'} - Which of the duplicate rows to keep. - - * \'any\': Does not give any guarantee of which row is kept. - This allows more optimizations. - * \'none\': Don\'t keep duplicate rows. - * \'first\': Keep first unique row. - * \'last\': Keep last unique row. - maintain_order - Keep the same order as the original DataFrame. This is more expensive to - compute. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - Returns - ------- - DataFrame - DataFrame with unique rows. - - Warnings - -------- - This method will fail if there is a column of type `List` in the DataFrame or - subset. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 1], - ... "bar": ["a", "a", "a", "a"], - ... "ham": ["b", "b", "b", "b"], - ... } - ... ) - >>> df.unique(maintain_order=True) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> df.unique(subset=["bar", "ham"], maintain_order=True) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> df.unique(keep="last", maintain_order=True) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - - ''' - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: - ''' - Return the number of unique rows, or the number of unique row-subsets. - - Parameters - ---------- - subset - One or more columns/expressions that define what to count; - omit to return the count of unique rows. 
- - Notes - ----- - This method operates at the `DataFrame` level; to operate on subsets at the - expression level you can make use of struct-packing instead, for example: - - >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() - - If instead you want to count the number of unique values per-column, you can - also use expression-level syntax to return a new frame containing that result: - - >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) - >>> df_nunique = df.select(pl.all().n_unique()) - - In aggregate context there is also an equivalent method for returning the - unique values per-group: - - >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 1, 2, 3, 4, 5], - ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], - ... "c": [True, True, True, False, True, True], - ... } - ... ) - >>> df.n_unique() - 5 - - Simple columns subset. - - >>> df.n_unique(subset=["b", "c"]) - 4 - - Expression subset. - - >>> df.n_unique( - ... subset=[ - ... (pl.col("a") // 2), - ... (pl.col("c") | (pl.col("b") >= 2)), - ... ], - ... ) - 3 - - ''' - def approx_n_unique(self) -> DataFrame: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> df.approx_n_unique() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_unique(self) -> DataFrame: - """ - Approximate count of unique values. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`DataFrame.approx_n_unique`. - - """ - def rechunk(self) -> Self: - """ - Rechunk the data in this DataFrame to a contiguous allocation. - - This will make sure all subsequent operations have optimal and predictable - performance. - """ - def null_count(self) -> Self: - ''' - Create a new DataFrame that shows the null counts per column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 3], - ... "bar": [6, 7, None], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.null_count() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 1 ┆ 0 │ - └─────┴─────┴─────┘ - - ''' - def sample(self, n: int | Series | None = ...) -> Self: - ''' - Sample from this DataFrame. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - If set to True, the order of the sampled rows will be shuffled. If - set to False (default), the order of the returned rows will be - neither stable nor fully random. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: - ''' - Apply a horizontal reduction on a DataFrame. - - This can be used to effectively determine aggregations on a row level, and can - be applied to any DataType that can be supercasted (casted to a similar parent - type). - - An example of the supercast rules when applying an arithmetic operation on two - DataTypes are for instance: - - - Int8 + Utf8 = Utf8 - - Float32 + Int64 = Float32 - - Float32 + Float64 = Float64 - - Examples - -------- - A horizontal sum operation: - - >>> df = pl.DataFrame( - ... { - ... "a": [2, 1, 3], - ... "b": [1, 2, 3], - ... "c": [1.0, 2.0, 3.0], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 + s2) - shape: (3,) - Series: \'a\' [f64] - [ - 4.0 - 5.0 - 9.0 - ] - - A horizontal minimum operation: - - >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) - >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 1.0 - 3.0 - ] - - A horizontal string concatenation: - - >>> df = pl.DataFrame( - ... { - ... "a": ["foo", "bar", 2], - ... "b": [1, 2, 3], - ... "c": [1.0, 2.0, 3.0], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 + s2) - shape: (3,) - Series: \'a\' [str] - [ - "foo11.0" - "bar22.0" - null - ] - - A horizontal boolean or, similar to a row-wise .any(): - - >>> df = pl.DataFrame( - ... { - ... "a": [False, False, True], - ... "b": [False, True, False], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 | s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - Parameters - ---------- - operation - function that takes two `Series` and returns a `Series`. - - ''' - def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: - ''' - Get the values of a single row, either by index or by predicate. - - Parameters - ---------- - index - Row index. - by_predicate - Select the row according to a given expression/predicate. - named - Return a dictionary instead of a tuple. The dictionary is a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - - Returns - ------- - tuple (default) or dictionary of row values - - Notes - ----- - The `index` and `by_predicate` params are mutually exclusive. Additionally, - to ensure clarity, the `by_predicate` parameter must be supplied by keyword. - - When using `by_predicate` it is an error condition if anything other than - one row is returned; more than one row raises `TooManyRowsReturnedError`, and - zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). - - Warnings - -------- - You should NEVER use this method to iterate over a DataFrame; if you require - row-iteration you should strongly prefer use of `iter_rows()` instead. - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - rows : Materialise all frame data as a list of rows (potentially expensive). - item: Return dataframe element as a scalar. - - Examples - -------- - Specify an index to return the row at the given index as a tuple. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.row(2) - (3, 8, \'c\') - - Specify `named=True` to get a dictionary instead with a mapping of column - names to row values. - - >>> df.row(2, named=True) - {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} - - Use `by_predicate` to return the row that matches the given predicate. - - >>> df.row(by_predicate=(pl.col("ham") == "b")) - (2, 7, \'b\') - - ''' - def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: - ''' - Returns all data in the DataFrame as a list of rows of python-native values. - - Parameters - ---------- - named - Return dictionaries instead of tuples. The dictionaries are a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Warnings - -------- - Row-iteration is not optimal as the underlying data is stored in columnar form; - where possible, prefer export via one of the dedicated export/output methods. - Where possible you should also consider using `iter_rows` instead to avoid - materialising all the data at once. - - Returns - ------- - list of tuples (default) or dictionaries of row values - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - rows_by_key : Materialises frame data as a key-indexed dictionary. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "x": ["a", "b", "b", "a"], - ... "y": [1, 2, 3, 4], - ... "z": [0, 3, 6, 9], - ... } - ... ) - >>> df.rows() - [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] - >>> df.rows(named=True) - [{\'x\': \'a\', \'y\': 1, \'z\': 0}, - {\'x\': \'b\', \'y\': 2, \'z\': 3}, - {\'x\': \'b\', \'y\': 3, \'z\': 6}, - {\'x\': \'a\', \'y\': 4, \'z\': 9}] - - ''' - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: - ''' - Returns DataFrame data as a keyed dictionary of python-native values. - - Note that this method should not be used in place of native operations, due to - the high cost of materialising all frame data out into a dictionary; it should - be used only when you need to move the values out into a Python data structure - or other object that cannot operate directly with Polars/Arrow. - - Parameters - ---------- - key - The column(s) to use as the key for the returned dictionary. If multiple - columns are specified, the key will be a tuple of those values, otherwise - it will be a string. - named - Return dictionary rows instead of tuples, mapping column name to row value. - include_key - Include key values inline with the associated data (by default the key - values are omitted as a memory/performance optimisation, as they can be - reoconstructed from the key). - unique - Indicate that the key is unique; this will result in a 1:1 mapping from - key to a single associated row. Note that if the key is *not* actually - unique the last row with the given key will be returned. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. 
If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - See Also - -------- - rows : Materialise all frame data as a list of rows (potentially expensive). - iter_rows : Row iterator over frame data (does not materialise all rows). - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "w": ["a", "b", "b", "a"], - ... "x": ["q", "q", "q", "k"], - ... "y": [1.0, 2.5, 3.0, 4.5], - ... "z": [9, 8, 7, 6], - ... } - ... ) - - Group rows by the given key column(s): - - >>> df.rows_by_key(key=["w"]) - defaultdict(, - {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], - \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) - - Return the same row groupings as dictionaries: - - >>> df.rows_by_key(key=["w"], named=True) - defaultdict(, - {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, - {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], - \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, - {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) - - Return row groupings, assuming keys are unique: - - >>> df.rows_by_key(key=["z"], unique=True) - {9: (\'a\', \'q\', 1.0), - 8: (\'b\', \'q\', 2.5), - 7: (\'b\', \'q\', 3.0), - 6: (\'a\', \'k\', 4.5)} - - Return row groupings as dictionaries, assuming keys are unique: - - >>> df.rows_by_key(key=["z"], named=True, unique=True) - {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, - 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, - 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, - 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} - - Return dictionary rows grouped by a compound key, including key values: - - >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) - defaultdict(, - {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], - (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, - {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], - (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) - - ''' - def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: - ''' - Returns an iterator over the DataFrame of rows of python-native values. - - Parameters - ---------- - named - Return dictionaries instead of tuples. The dictionaries are a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - buffer_size - Determines the number of rows that are buffered internally while iterating - over the data; you should only modify this in very specific cases where the - default value is determined not to be a good fit to your access pattern, as - the speedup from using the buffer is significant (~2-4x). Setting this - value to zero disables row buffering (not recommended). - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Warnings - -------- - Row iteration is not optimal as the underlying data is stored in columnar form; - where possible, prefer export via one of the dedicated export/output methods - that deals with columnar data. - - Returns - ------- - iterator of tuples (default) or dictionaries (if named) of python row values - - See Also - -------- - rows : Materialises all frame data as a list of rows (potentially expensive). - rows_by_key : Materialises frame data as a key-indexed dictionary. 
- - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> [row[0] for row in df.iter_rows()] - [1, 3, 5] - >>> [row["b"] for row in df.iter_rows(named=True)] - [2, 4, 6] - - ''' - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: - ''' - Returns a non-copying iterator of slices over the underlying DataFrame. - - Parameters - ---------- - n_rows - Determines the number of rows contained in each DataFrame slice. - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... data={ - ... "a": range(17_500), - ... "b": date(2023, 1, 1), - ... "c": "klmnoopqrstuvwxyz", - ... }, - ... schema_overrides={"a": pl.Int32}, - ... ) - >>> for idx, frame in enumerate(df.iter_slices()): - ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") - ... - DataFrame:[0]:10000 - DataFrame:[1]:7500 - - Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and - any supported frame export/conversion types; for example, as RecordBatches: - - >>> for frame in df.iter_slices(n_rows=15_000): - ... record_batch = frame.to_arrow().to_batches()[0] - ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") - ... - a: int32 - b: date32[day] - c: large_string - << 15000 - a: int32 - b: date32[day] - c: large_string - << 2500 - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - partition_by : Split into multiple DataFrames, partitioned by groups. - - ''' - def shrink_to_fit(self) -> Self: - """ - Shrink DataFrame memory usage. - - Shrinks to fit the exact capacity needed to hold the data. - - """ - def gather_every(self, n: int) -> DataFrame: - ''' - Take every nth row in the DataFrame and return as a new DataFrame. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - >>> s.gather_every(2) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 5 │ - │ 3 ┆ 7 │ - └─────┴─────┘ - - ''' - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: - ''' - Hash and combine the rows in this DataFrame. - - The hash value is of type `UInt64`. - - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. - - Notes - ----- - This implementation of :func:`hash_rows` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 3, 4], - ... "ham": ["a", "b", None, "d"], - ... } - ... ) - >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT - shape: (4,) - Series: \'\' [u64] - [ - 10783150408545073287 - 1438741209321515184 - 10047419486152048166 - 2047317070637311557 - ] - - ''' - def interpolate(self) -> DataFrame: - ''' - Interpolate intermediate values. The interpolation method is linear. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 9, 10], - ... "bar": [6, 7, 9, None], - ... "baz": [1, None, None, 9], - ... } - ... 
) - >>> df.interpolate() - shape: (4, 3) - ┌──────┬──────┬──────────┐ - │ foo ┆ bar ┆ baz │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════════╡ - │ 1.0 ┆ 6.0 ┆ 1.0 │ - │ 5.0 ┆ 7.0 ┆ 3.666667 │ - │ 9.0 ┆ 9.0 ┆ 6.333333 │ - │ 10.0 ┆ null ┆ 9.0 │ - └──────┴──────┴──────────┘ - - ''' - def is_empty(self) -> bool: - ''' - Check if the dataframe is empty. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.is_empty() - False - >>> df.filter(pl.col("foo") > 99).is_empty() - True - - ''' - def to_struct(self, name: str) -> Series: - ''' - Convert a `DataFrame` to a `Series` of type `Struct`. - - Parameters - ---------- - name - Name for the struct Series - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4, 5], - ... "b": ["one", "two", "three", "four", "five"], - ... } - ... ) - >>> df.to_struct("nums") - shape: (5,) - Series: \'nums\' [struct[2]] - [ - {1,"one"} - {2,"two"} - {3,"three"} - {4,"four"} - {5,"five"} - ] - - ''' - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Decompose struct columns into separate columns for each of their fields. - - The new columns will be inserted into the dataframe at the location of the - struct column. - - Parameters - ---------- - columns - Name of the struct column(s) that should be unnested. - *more_columns - Additional columns to unnest, specified as positional arguments. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "before": ["foo", "bar"], - ... "t_a": [1, 2], - ... "t_b": ["a", "b"], - ... "t_c": [True, None], - ... "t_d": [[1, 2], [3]], - ... "after": ["baz", "womp"], - ... } - ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") - >>> df - shape: (2, 3) - ┌────────┬─────────────────────┬───────┐ - │ before ┆ t_struct ┆ after │ - │ --- ┆ --- ┆ --- │ - │ str ┆ struct[4] ┆ str │ - ╞════════╪═════════════════════╪═══════╡ - │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ - │ bar ┆ {2,"b",null,[3]} ┆ womp │ - └────────┴─────────────────────┴───────┘ - >>> df.unnest("t_struct") - shape: (2, 6) - ┌────────┬─────┬─────┬──────┬───────────┬───────┐ - │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ - ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ - │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ - │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ - └────────┴─────┴─────┴──────┴───────────┴───────┘ - - ''' - def corr(self, **kwargs: Any) -> DataFrame: - ''' - Return pairwise Pearson product-moment correlation coefficients between columns. - - See numpy `corrcoef` for more information: - https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html - - Notes - ----- - This functionality requires numpy to be installed. - - Parameters - ---------- - **kwargs - Keyword arguments are passed to numpy `corrcoef`. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) - >>> df.corr() - shape: (3, 3) - ┌──────┬──────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════╡ - │ 1.0 ┆ -1.0 ┆ 1.0 │ - │ -1.0 ┆ 1.0 ┆ -1.0 │ - │ 1.0 ┆ -1.0 ┆ 1.0 │ - └──────┴──────┴──────┘ - - ''' - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: - ''' - Take two sorted DataFrames and merge them by the sorted key. - - The output of this operation will also be sorted. 
- It is the callers responsibility that the frames are sorted - by that key otherwise the output will not make sense. - - The schemas of both DataFrames must be equal. - - Parameters - ---------- - other - Other DataFrame that must be merged - key - Key that is sorted. - - Examples - -------- - >>> df0 = pl.DataFrame( - ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} - ... ).sort("age") - >>> df0 - shape: (3, 2) - ┌───────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═════╡ - │ bob ┆ 18 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └───────┴─────┘ - >>> df1 = pl.DataFrame( - ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} - ... ).sort("age") - >>> df1 - shape: (4, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - └────────┴─────┘ - >>> df0.merge_sorted(df1, key="age") - shape: (7, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ bob ┆ 18 │ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └────────┴─────┘ - ''' - def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: - """ - Indicate that one or multiple columns are sorted. - - Parameters - ---------- - column - Columns that are sorted - more_columns - Additional columns that are sorted, specified as positional arguments. - descending - Whether the columns are sorted in descending order. - """ - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: - ''' - Update the values in this `DataFrame` with the values in `other`. - - By default, null values in the right dataframe are ignored. Use - `ignore_nulls=False` to overwrite values in this frame with null values in other - frame. - - Notes - ----- - This is syntactic sugar for a left/inner join, with an optional coalesce when - `include_nulls = False`. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Parameters - ---------- - other - DataFrame that will be used to update the values - on - Column names that will be joined on. - If none given the row count is used. - left_on - Join column(s) of the left DataFrame. - right_on - Join column(s) of the right DataFrame. - how : {\'left\', \'inner\', \'outer\'} - * \'left\' will keep all rows from the left table; rows may be duplicated - if multiple rows in the right frame match the left row\'s key. - * \'inner\' keeps only those rows where the key exists in both frames. - * \'outer\' will update existing rows where the key matches while also - adding any new rows contained in the given frame. - include_nulls - If True, null values from the right dataframe will be used to update the - left dataframe. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4], - ... "B": [400, 500, 600, 700], - ... } - ... ) - >>> df - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 400 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - >>> new_df = pl.DataFrame( - ... { - ... "B": [-66, None, -99], - ... "C": [5, 3, 1], - ... } - ... 
) - - Update `df` values with the non-null values in `new_df`, by row index: - - >>> df.update(new_df) - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, by row index, - but only keeping those rows that are common to both frames: - - >>> df.update(new_df, how="inner") - shape: (3, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") - shape: (5, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴─────┘ - - Update `df` values including null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> df.update( - ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ null │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴──────┘ - - ''' - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: - """ - Start a group by operation. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.group_by`. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - .. note:: - Within each group, the order of rows is always preserved, regardless - of this argument. - - Returns - ------- - GroupBy - Object which can be used to perform aggregations. - - """ - def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. 
- Doing so incorrectly will lead to incorrect output - - """ - def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.9 - This method has been renamed to :func:`DataFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - """ - def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.group_by_dynamic`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - include_boundaries - Add the lower and upper bound of the window to the "_lower_bound" and - "_upper_bound" columns. This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. 
- check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - DynamicGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - ''' - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: - """ - Apply a custom/user-defined function (UDF) over the rows of the DataFrame. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.map_rows`. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output type of the operation. If none given, Polars tries to infer the type. - inference_size - Only used in the case when the custom function returns rows. - This uses the first `n` rows to determine the output schema - - """ - def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - fill None values with this value. - n - Number of places to shift (may be negative). - - """ - def take_every(self, n: int) -> DataFrame: - """ - Take every nth row in the DataFrame and return as a new DataFrame. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def find_idx_by_name(self, name: str) -> int: - """ - Find the index of a column by name. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`get_column_index`. - - Parameters - ---------- - name - Name of the column to find. - """ - def insert_at_idx(self, index: int, column: Series) -> Self: - """ - Insert a Series at a certain column index. This operation is in place. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`insert_column`. - - Parameters - ---------- - index - Column to insert the new `Series` column. - column - `Series` to insert. - """ - def replace_at_idx(self, index: int, new_column: Series) -> Self: - """ - Replace a column at an index location. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`replace_column`. - - Parameters - ---------- - index - Column index. - new_column - Series that will replace the column. - """ - def frame_equal(self, other: DataFrame) -> bool: - """ - Check whether the DataFrame is equal to another DataFrame. - - .. deprecated:: 0.19.16 - This method has been renamed to :func:`equals`. - - Parameters - ---------- - other - DataFrame to compare with. - null_equal - Consider null values as equal. - """ - @property - def shape(self): ... - @property - def height(self): ... - @property - def width(self): ... - @property - def dtypes(self): ... - @property - def flags(self): ... - @property - def schema(self): ... -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/dataframe/frame.pyi new file mode 100644 index 0000000..fa46951 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/dataframe/frame.pyi @@ -0,0 +1,7092 @@ +#: version 0.20.2 +import P +import deltalake +import deltalake.table +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Enum as Enum, Float64 as Float64, Null as Null, Object as Object, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import ModuleUpgradeRequired as ModuleUpgradeRequired, NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, _warn_null_comparison as _warn_null_comparison, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, 
is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_PANDAS_AVAILABLE: bool +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. 
+ + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
+ schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use `pl.read_csv` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use `pl.read_parquet` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading `n_rows`. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use `pl.read_json` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use `pl.read_ndjson` to dispatch to this method. 
+ + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with `NaN`. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to `True` will raise a `NotImplementedError`. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars DataFrame to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... 
+ def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the DataFrame as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to `df[0,0]`, with a check that + the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are Series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + `structured` is set to `False` and the DataFrame dtypes allow for a + global dtype for all columns. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + function for the conversion to numpy if necessary. + + Notes + ----- + If you\'re attempting to convert Utf8 or Decimal to an array, you\'ll need to + install `pyarrow`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. 
, \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Utf8), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Utf8), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. 
This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path or writeable file-like object to which the data will be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + name + Schema name. Defaults to empty string. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open `xlsxwriter.Workbook` object that has not been closed. + If None, writes to a `dataframe.xlsx` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of `{"key":value,}` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. + column_formats : dict + A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. + dtype_formats : dict + A `{dtype:str,}` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + `column_formats` param). It is also valid to use dtype groups such as + `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid `xlsxwriter` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all `xlsxwriter` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. 
+ header_format : dict + A `{key:value,}` dictionary of `xlsxwriter` format options to apply + to the table header row, such as `{"bold":True, "font_color":"#702963"}`. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a `{colname:funcname,}` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A `{colname:int,}` or `{selector:int,}` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a `{colname:columns,}` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or `{row_index:int,}` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that `row_index` starts at zero and will be + the header row (unless `include_header` is False). + sparklines : dict + A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an `xlsxwriter`-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. 
+ float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + include_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible `xlsxwriter` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic DataFrame: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... 
) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + ... + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... 
# create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC data will be + written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC record batch data will + be written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. 
+ compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + data_page_size + Size of the data page in bytes. Defaults to 1024^2 bytes. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to `pyarrow.parquet.write_table`. + + If you pass `partition_cols` here, the dataset will be written + using `pyarrow.parquet.write_to_dataset`. + The `partition_cols` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, table_name: str, connection: str) -> int: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Schema-qualified name of the table to create or append to in the target + SQL database. If your table name contains special characters, it should + be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_table_exists : {\'append\', \'replace\', \'fail\'} + The insert mode: + + * \'replace\' will create a new database table, overwriting an existing one. + * \'append\' will append to an existing table. + * \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine to use for writing frame data. + + Returns + ------- + int + The number of rows affected, if the driver provides this information. + Otherwise, returns -1. + + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> deltalake.table.TableMerger | None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\', \'merge\'} + How to handle existing data. + + - If \'error\', throw an error if the table already exists (default). + - If \'append\', will add new data. + - If \'overwrite\', will replace table with new data. + - If \'ignore\', will not write anything if table already exists. + - If \'merge\', return a `TableMerger` object to merge data from the DataFrame + with the existing data. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. 
+ + - See a list of supported storage options for S3 `here `__. + - See a list of supported storage options for GCS `here `__. + - See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + delta_merge_options + Keyword arguments which are required to `MERGE` a Delta lake Table. + See a list of supported merge options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + TableNotFoundError + If the delta table doesn\'t exist and MERGE action is triggered + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. + + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a DataFrame as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + Merge the DataFrame with an existing Delta Lake table. + For all `TableMerger` methods, check the deltalake docs + `here `__. + + Schema evolution is not yet supported in by the `deltalake` package, therefore + `overwrite_schema` will not have any effect on a merge operation. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> ( + ... df.write_delta( + ... "table_path", + ... mode="merge", + ... delta_merge_options={ + ... "predicate": "s.foo = t.foo", + ... "source_alias": "s", + ... "target_alias": "t", + ... }, + ... ) + ... .when_matched_update_all() + ... .when_not_matched_insert_all() + ... .execute() + ... 
) # doctest: +SKIP + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... count += 1 + ... 
+ >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_column(self, index: int, column: Series) -> Self: + ''' + Insert a Series at a certain column index. + + This operation is in place. + + Parameters + ---------- + index + Index at which to insert the new `Series` column. + column + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_column(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_column(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... ) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Warnings + -------- + We will never guarantee the output of describe to be stable. + It will show statistics that we deem informative and may + be updated in the future. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "float": [1.0, 2.8, 3.0], + ... "int": [4, 5, None], + ... "bool": [True, False, True], + ... "str": [None, "b", "c"], + ... "str2": ["usd", "eur", None], + ... "date": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... 
) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬───────┬──────┬──────┬────────────┐ + │ describe ┆ float ┆ int ┆ bool ┆ str ┆ str2 ┆ date │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ str ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪═══════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3 ┆ 2 ┆ 2 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ False ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 2.8 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ True ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴───────┴──────┴──────┴────────────┘ + + ''' + def get_column_index(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.get_column_index("ham") + 2 + + ''' + def replace_column(self, index: int, column: Series) -> Self: + ''' + Replace a column at an index location. + + This operation is in place. + + Parameters + ---------- + index + Column index. + column + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_column(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def equals(self, other: DataFrame) -> bool: + ''' + Check whether the DataFrame is equal to another DataFrame. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + See Also + -------- + assert_frame_equal + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... 
"bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.equals(df1) + True + >>> df1.equals(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. 
+ + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... 
+ >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. 
+ + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The `GroupBy` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + ... + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. 
If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... 
"values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\', \'outer_coalesce\'} + Join strategy. + + * *inner* + Returns rows that have matching values in both tables + * *left* + Returns all rows from the left table, and the matched rows from the + right table + * *outer* + Returns all rows when there is a match in either left or right table + * *outer_coalesce* + Same as \'outer\', but coalesces the key columns + * *cross* + Returns the cartisian product of rows from both tables + * *semi* + Filter rows that have a match in the right table. + * *anti* + Filter rows that not have a match in the right table. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + join_nulls + Join on null values. By default null values will never produce matches. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 5) + ┌──────┬──────┬──────┬───────┬───────────┐ + │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞══════╪══════╪══════╪═══════╪═══════════╡ + │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │ + │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │ + │ null ┆ null ┆ null ┆ z ┆ d │ + │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │ + └──────┴──────┴──────┴───────┴───────────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see `pl.StringCache()`. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: `udf(row)`. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level `apply` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level `apply` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
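+
+ As a minimal sketch of that note (illustrative only, not from the generated stub): `expensive_lookup` is a hypothetical stand-in for a costly user function, applied to the `df` defined in the examples below.
+
+ >>> from functools import lru_cache
+ >>> @lru_cache(maxsize=None)
+ ... def expensive_lookup(key):
+ ...     return key * 2  # stand-in for a genuinely slow computation
+ ...
+ >>> df.map_rows(lambda row: expensive_lookup(row[0]))  # doctest: +SKIP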
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of + this `DataFrame`, `extend` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer `vstack` over `extend` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single `DataFrame`. In the latter case, finish the sequence of + `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
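+
+ A small sketch of the `vstack`-then-`rechunk` pattern described above (illustrative frames, not from the generated stub):
+
+ >>> chunks = [pl.DataFrame({"foo": [i], "bar": [i * 10]}) for i in range(3)]
+ >>> combined = chunks[0]
+ >>> for piece in chunks[1:]:
+ ...     combined = combined.vstack(piece)
+ ...
+ >>> combined = combined.rechunk()  # consolidate the appended chunks before querying
+ >>> combined.shape
+ (3, 2)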
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.Utf8).to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill `value`. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to `None` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. 
+ + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying `as_dict=True`. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. 
+ + >>> df.shift(-2, fill_value=100) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`max_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def max_horizontal(self) -> Series: + ''' + Get the maximum value horizontally across columns. + + Returns + ------- + Series + A Series named `"max"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.max_horizontal() + shape: (3,) + Series: \'max\' [f64] + [ + 4.0 + 5.0 + 6.0 + ] + ''' + def min(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`min_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def min_horizontal(self) -> Series: + ''' + Get the minimum value horizontally across columns. + + Returns + ------- + Series + A Series named `"min"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.min_horizontal() + shape: (3,) + Series: \'min\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`sum_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + ''' + def sum_horizontal(self) -> Series: + ''' + Sum all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. 
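+
+        For instance, a minimal sketch using a small throwaway frame (output
+        elided; it assumes `ignore_nulls` behaves as documented above, so the
+        row containing a null sums to null):
+
+        >>> df = pl.DataFrame({"foo": [1, None], "bar": [4.0, 5.0]})
+        >>> df.sum_horizontal(ignore_nulls=False)  # doctest: +IGNORE_RESULT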
+ + Returns + ------- + Series + A Series named `"sum"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.sum_horizontal() + shape: (3,) + Series: \'sum\' [f64] + [ + 5.0 + 7.0 + 9.0 + ] + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`mean_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + ''' + def mean_horizontal(self) -> Series: + ''' + Take the mean of all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"mean"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.mean_horizontal() + shape: (3,) + Series: \'mean\' [f64] + [ + 2.5 + 3.5 + 4.5 + ] + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to `None` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
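+
+            A plain column name is also accepted for `subset`, per the
+            signature above; a minimal sketch (output elided):
+
+            >>> pl.DataFrame({"a": [1, 1, 2]}).n_unique(subset="a")  # doctest: +IGNORE_RESULT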
+ + Notes + ----- + This method operates at the `DataFrame` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. 
+ + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + - Int8 + Utf8 = Utf8 + - Float32 + Int64 = Float32 + - Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The `index` and `by_predicate` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using `by_predicate` it is an error condition if anything other than + one row is returned; more than one row raises `TooManyRowsReturnedError`, and + zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of `iter_rows()` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify `named=True` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use `by_predicate` to return the row that matches the given predicate. 
+ + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using `iter_rows` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_columns(self) -> Iterator[Series]: + ''' + Returns an iterator over the DataFrame\'s columns. 
+ + Notes + ----- + Consider whether you can use :func:`all` instead. + If you can, it will be more efficient. + + Returns + ------- + Iterator of Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [s.name for s in df.iter_columns()] + [\'a\', \'b\'] + + If you\'re using this to modify a dataframe\'s columns, e.g. + + >>> # Do NOT do this + >>> pl.DataFrame(column * 2 for column in df.iter_columns()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + + then consider whether you can use :func:`all` instead: + + >>> df.select(pl.all() * 2) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + ... + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + ... + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def gather_every(self, n: int) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.gather_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash_rows` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... 
) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str = ...) -> Series: + ''' + Convert a `DataFrame` to a `Series` of type `Struct`. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy `corrcoef` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. + + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy `corrcoef`. 
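+
+            For instance, a result dtype can be requested; a minimal sketch
+            (output elided) assuming the keyword is forwarded verbatim to
+            `numpy.corrcoef`:
+
+            >>> import numpy as np
+            >>> pl.DataFrame({"x": [1, 2, 3], "y": [3, 2, 1]}).corr(
+            ...     dtype=np.float32
+            ... )  # doctest: +IGNORE_RESULT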
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the values in `other`. + + .. warning:: + This functionality is experimental and may change without it being + considered a breaking change. + + By default, null values in the right frame are ignored. Use + `include_nulls=False` to overwrite values in this frame with + null values in the other frame. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce + when `include_nulls = False` + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. 
+ * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> DataFrame: + """ + Take every nth row in the DataFrame and return as a new DataFrame. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def find_idx_by_name(self, name: str) -> int: + """ + Find the index of a column by name. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`get_column_index`. + + Parameters + ---------- + name + Name of the column to find. + """ + def insert_at_idx(self, index: int, column: Series) -> Self: + """ + Insert a Series at a certain column index. This operation is in place. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`insert_column`. + + Parameters + ---------- + index + Column to insert the new `Series` column. + column + `Series` to insert. + """ + def replace_at_idx(self, index: int, new_column: Series) -> Self: + """ + Replace a column at an index location. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`replace_column`. + + Parameters + ---------- + index + Column index. + new_column + Series that will replace the column. + """ + def frame_equal(self, other: DataFrame) -> bool: + """ + Check whether the DataFrame is equal to another DataFrame. + + .. deprecated:: 0.19.16 + This method has been renamed to :func:`equals`. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + """ + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... 
+def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/expr/expr deleted file mode 100644 index 5131d44..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/expr/expr +++ /dev/null @@ -1,8289 +0,0 @@ -import P -import np as np -import pl -from builtins import PyExpr -from datetime import timedelta -from polars.datatypes.classes import Categorical as Categorical, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 -from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy -from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import no_default as no_default, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence - -TYPE_CHECKING: bool -py_arg_where: builtin_function_or_method -pyreduce: builtin_function_or_method - -class Expr: - _pyexpr: _ClassVar[None] = ... - _accessors: _ClassVar[set] = ... - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _repr_html_(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... - def __and__(self, other: Expr | int | bool) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... 
- def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int | bool) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr | int | bool) -> Self: ... - def __rxor__(self, other: Any) -> Self: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: - """Numpy universal functions.""" - @classmethod - def from_json(cls, value: str) -> Self: - """ - Read an expression from a JSON encoded string to construct an Expression. - - Parameters - ---------- - value - JSON encoded string value - - """ - def to_physical(self) -> Self: - ''' - Cast to physical representation of the logical dtype. - - - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` - - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` - - `List(inner)` -> `List(physical of inner)` - - Other data types will be left unchanged. - - Examples - -------- - Replicating the pandas - `pd.factorize - `_ - function. - - >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( - ... [ - ... pl.col("vals").cast(pl.Categorical), - ... pl.col("vals") - ... .cast(pl.Categorical) - ... .to_physical() - ... .alias("vals_physical"), - ... ] - ... ) - shape: (4, 2) - ┌──────┬───────────────┐ - │ vals ┆ vals_physical │ - │ --- ┆ --- │ - │ cat ┆ u32 │ - ╞══════╪═══════════════╡ - │ a ┆ 0 │ - │ x ┆ 1 │ - │ null ┆ null │ - │ a ┆ 0 │ - └──────┴───────────────┘ - - ''' - def any(self) -> Self: - ''' - Return whether any of the values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is null. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, False], - ... "b": [False, False], - ... "c": [None, False], - ... } - ... ) - >>> df.select(pl.col("*").any()) - shape: (1, 3) - ┌──────┬───────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪═══════╡ - │ true ┆ false ┆ false │ - └──────┴───────┴───────┘ - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> df.select(pl.col("*").any(ignore_nulls=False)) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ null │ - └──────┴───────┴──────┘ - - ''' - def all(self) -> Self: - ''' - Return whether all values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - .. 
note:: - This method is not to be confused with the function :func:`polars.all`, - which can be used to select all columns. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is null. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, True], - ... "b": [False, True], - ... "c": [None, True], - ... } - ... ) - >>> df.select(pl.col("*").all()) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ true │ - └──────┴───────┴──────┘ - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> df.select(pl.col("*").all(ignore_nulls=False)) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ null │ - └──────┴───────┴──────┘ - - ''' - def arg_true(self) -> Self: - ''' - Return indices where expression evaluates `True`. - - .. warning:: - Modifies number of rows returned, so will fail in combination with other - expressions. Use as only expression in `select` / `with_columns`. - - See Also - -------- - Series.arg_true : Return indices where Series is True - polars.arg_where - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) - >>> df.select((pl.col("a") == 1).arg_true()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - │ 3 │ - └─────┘ - - ''' - def sqrt(self) -> Self: - ''' - Compute the square root of the elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").sqrt()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.414214 │ - │ 2.0 │ - └──────────┘ - - ''' - def cbrt(self) -> Self: - ''' - Compute the cube root of the elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").cbrt()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.259921 │ - │ 1.587401 │ - └──────────┘ - - ''' - def log10(self) -> Self: - ''' - Compute the base 10 logarithm of the input array, element-wise. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").log10()) - shape: (3, 1) - ┌─────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞═════════╡ - │ 0.0 │ - │ 0.30103 │ - │ 0.60206 │ - └─────────┘ - - ''' - def exp(self) -> Self: - ''' - Compute the exponential, element-wise. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").exp()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 2.718282 │ - │ 7.389056 │ - │ 54.59815 │ - └──────────┘ - - ''' - def alias(self, name: str) -> Self: - ''' - Rename the expression. - - Parameters - ---------- - name - The new name. - - See Also - -------- - map - prefix - suffix - - Examples - -------- - Rename an expression to avoid overwriting an existing column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... ) - >>> df.with_columns( - ... pl.col("a") + 10, - ... pl.col("b").str.to_uppercase().alias("c"), - ... 
) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 11 ┆ x ┆ X │ - │ 12 ┆ y ┆ Y │ - │ 13 ┆ z ┆ Z │ - └─────┴─────┴─────┘ - - Overwrite the default name of literal columns to prevent errors due to duplicate - column names. - - >>> df.with_columns( - ... pl.lit(True).alias("c"), - ... pl.lit(4.0).alias("d"), - ... ) - shape: (3, 4) - ┌─────┬─────┬──────┬─────┐ - │ a ┆ b ┆ c ┆ d │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ bool ┆ f64 │ - ╞═════╪═════╪══════╪═════╡ - │ 1 ┆ x ┆ true ┆ 4.0 │ - │ 2 ┆ y ┆ true ┆ 4.0 │ - │ 3 ┆ z ┆ true ┆ 4.0 │ - └─────┴─────┴──────┴─────┘ - - ''' - def map_alias(self, function: Callable[[str], str]) -> Self: - ''' - Rename the output of an expression by mapping a function over the root name. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.map`. - - Parameters - ---------- - function - Function that maps a root name to a new name. - - See Also - -------- - keep_name - prefix - suffix - - Examples - -------- - Remove a common suffix and convert to lower case. - - >>> df = pl.DataFrame( - ... { - ... "A_reverse": [3, 2, 1], - ... "B_reverse": ["z", "y", "x"], - ... } - ... ) - >>> df.with_columns( - ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) - ... ) - shape: (3, 4) - ┌───────────┬───────────┬─────┬─────┐ - │ A_reverse ┆ B_reverse ┆ a ┆ b │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═══════════╪═══════════╪═════╪═════╡ - │ 3 ┆ z ┆ 1 ┆ x │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 1 ┆ x ┆ 3 ┆ z │ - └───────────┴───────────┴─────┴─────┘ - - ''' - def prefix(self, prefix: str) -> Self: - ''' - Add a prefix to the root column name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.prefix`. - - Parameters - ---------- - prefix - Prefix to add to the root column name. - - Notes - ----- - This will undo any previous renaming operations on the expression. - - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - suffix - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... ) - >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) - shape: (3, 4) - ┌─────┬─────┬───────────┬───────────┐ - │ a ┆ b ┆ reverse_a ┆ reverse_b │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪═════╪═══════════╪═══════════╡ - │ 1 ┆ x ┆ 3 ┆ z │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 3 ┆ z ┆ 1 ┆ x │ - └─────┴─────┴───────────┴───────────┘ - - ''' - def suffix(self, suffix: str) -> Self: - ''' - Add a suffix to the root column name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.suffix`. - - Parameters - ---------- - suffix - Suffix to add to the root column name. - - Notes - ----- - This will undo any previous renaming operations on the expression. - - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - prefix - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... 
) - >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) - shape: (3, 4) - ┌─────┬─────┬───────────┬───────────┐ - │ a ┆ b ┆ a_reverse ┆ b_reverse │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪═════╪═══════════╪═══════════╡ - │ 1 ┆ x ┆ 3 ┆ z │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 3 ┆ z ┆ 1 ┆ x │ - └─────┴─────┴───────────┴───────────┘ - - ''' - def keep_name(self) -> Self: - ''' - Keep the original root name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.keep`. - - Notes - ----- - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - alias - - Examples - -------- - Undo an alias operation. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2], - ... "b": [3, 4], - ... } - ... ) - >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 9 ┆ 3 │ - │ 18 ┆ 4 │ - └─────┴─────┘ - - Prevent errors due to duplicate column names. - - >>> df.select((pl.lit(10) / pl.all()).name.keep()) - shape: (2, 2) - ┌──────┬──────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════════╡ - │ 10.0 ┆ 3.333333 │ - │ 5.0 ┆ 2.5 │ - └──────┴──────────┘ - - ''' - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: - ''' - Exclude columns from a multi-column expression. - - Only works after a wildcard or regex column selection, and you cannot provide - both string column names *and* dtypes (you may prefer to use selectors instead). - - Parameters - ---------- - columns - The name or datatype of the column(s) to exclude. Accepts regular expression - input. Regular expressions should start with `^` and end with `$`. - *more_columns - Additional names or datatypes of columns to exclude, specified as positional - arguments. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "aa": [1, 2, 3], - ... "ba": ["a", "b", None], - ... "cc": [None, 2.5, 1.5], - ... } - ... ) - >>> df - shape: (3, 3) - ┌─────┬──────┬──────┐ - │ aa ┆ ba ┆ cc │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ f64 │ - ╞═════╪══════╪══════╡ - │ 1 ┆ a ┆ null │ - │ 2 ┆ b ┆ 2.5 │ - │ 3 ┆ null ┆ 1.5 │ - └─────┴──────┴──────┘ - - Exclude by column name(s): - - >>> df.select(pl.all().exclude("ba")) - shape: (3, 2) - ┌─────┬──────┐ - │ aa ┆ cc │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ null │ - │ 2 ┆ 2.5 │ - │ 3 ┆ 1.5 │ - └─────┴──────┘ - - Exclude by regex, e.g. removing all columns whose names end with the letter "a": - - >>> df.select(pl.all().exclude("^.*a$")) - shape: (3, 1) - ┌──────┐ - │ cc │ - │ --- │ - │ f64 │ - ╞══════╡ - │ null │ - │ 2.5 │ - │ 1.5 │ - └──────┘ - - Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: - - >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) - shape: (3, 1) - ┌──────┐ - │ ba │ - │ --- │ - │ str │ - ╞══════╡ - │ a │ - │ b │ - │ null │ - └──────┘ - - ''' - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the expression as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Examples - -------- - >>> def extract_number(expr: pl.Expr) -> pl.Expr: - ... 
"""Extract the digits from a string.""" - ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) - >>> - >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: - ... """Set even numbers negative, and scale by a user-supplied value.""" - ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) - ... return expr * n - >>> - >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) - >>> df.with_columns( - ... udfs=( - ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) - ... ), - ... ) - shape: (4, 2) - ┌──────┬──────┐ - │ val ┆ udfs │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞══════╪══════╡ - │ a: 1 ┆ 5 │ - │ b: 2 ┆ -10 │ - │ c: 3 ┆ 15 │ - │ d: 4 ┆ -20 │ - └──────┴──────┘ - - ''' - def is_not(self) -> Self: - """ - Negate a boolean expression. - - .. deprecated:: 0.19.2 - This method has been renamed to :func:`Expr.not_`. - - """ - def not_(self) -> Self: - ''' - Negate a boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, False, False], - ... "b": ["a", "b", None], - ... } - ... ) - >>> df - shape: (3, 2) - ┌───────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ bool ┆ str │ - ╞═══════╪══════╡ - │ true ┆ a │ - │ false ┆ b │ - │ false ┆ null │ - └───────┴──────┘ - >>> df.select(pl.col("a").not_()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ true │ - └───────┘ - - ''' - def is_null(self) -> Self: - ''' - Returns a boolean Series indicating which values are null. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null - shape: (5, 4) - ┌──────┬─────┬──────────┬──────────┐ - │ a ┆ b ┆ a_isnull ┆ b_isnull │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪═════╪══════════╪══════════╡ - │ 1 ┆ 1.0 ┆ false ┆ false │ - │ 2 ┆ 2.0 ┆ false ┆ false │ - │ null ┆ NaN ┆ true ┆ false │ - │ 1 ┆ 1.0 ┆ false ┆ false │ - │ 5 ┆ 5.0 ┆ false ┆ false │ - └──────┴─────┴──────────┴──────────┘ - - ''' - def is_not_null(self) -> Self: - ''' - Returns a boolean Series indicating which values are not null. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns( - ... pl.all().is_not_null().name.suffix("_not_null") # nan != null - ... ) - shape: (5, 4) - ┌──────┬─────┬────────────┬────────────┐ - │ a ┆ b ┆ a_not_null ┆ b_not_null │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪═════╪════════════╪════════════╡ - │ 1 ┆ 1.0 ┆ true ┆ true │ - │ 2 ┆ 2.0 ┆ true ┆ true │ - │ null ┆ NaN ┆ false ┆ true │ - │ 1 ┆ 1.0 ┆ true ┆ true │ - │ 5 ┆ 5.0 ┆ true ┆ true │ - └──────┴─────┴────────────┴────────────┘ - - ''' - def is_finite(self) -> Self: - ''' - Returns a boolean Series indicating which values are finite. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1.0, 2], - ... "B": [3.0, float("inf")], - ... } - ... ) - >>> df.select(pl.all().is_finite()) - shape: (2, 2) - ┌──────┬───────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞══════╪═══════╡ - │ true ┆ true │ - │ true ┆ false │ - └──────┴───────┘ - - ''' - def is_infinite(self) -> Self: - ''' - Returns a boolean Series indicating which values are infinite. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. 
- - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1.0, 2], - ... "B": [3.0, float("inf")], - ... } - ... ) - >>> df.select(pl.all().is_infinite()) - shape: (2, 2) - ┌───────┬───────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞═══════╪═══════╡ - │ false ┆ false │ - │ false ┆ true │ - └───────┴───────┘ - - ''' - def is_nan(self) -> Self: - ''' - Returns a boolean Series indicating which values are NaN. - - Notes - ----- - Floating point `NaN` (Not A Number) should not be confused - with missing data represented as `Null/None`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) - shape: (5, 3) - ┌──────┬─────┬─────────┐ - │ a ┆ b ┆ b_isnan │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪═════╪═════════╡ - │ 1 ┆ 1.0 ┆ false │ - │ 2 ┆ 2.0 ┆ false │ - │ null ┆ NaN ┆ true │ - │ 1 ┆ 1.0 ┆ false │ - │ 5 ┆ 5.0 ┆ false │ - └──────┴─────┴─────────┘ - - ''' - def is_not_nan(self) -> Self: - ''' - Returns a boolean Series indicating which values are not NaN. - - Notes - ----- - Floating point `NaN` (Not A Number) should not be confused - with missing data represented as `Null/None`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) - shape: (5, 3) - ┌──────┬─────┬──────────────┐ - │ a ┆ b ┆ b_is_not_nan │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪═════╪══════════════╡ - │ 1 ┆ 1.0 ┆ true │ - │ 2 ┆ 2.0 ┆ true │ - │ null ┆ NaN ┆ false │ - │ 1 ┆ 1.0 ┆ true │ - │ 5 ┆ 5.0 ┆ true │ - └──────┴─────┴──────────────┘ - - ''' - def agg_groups(self) -> Self: - ''' - Get the group indexes of the group by operation. - - Should be used in aggregation context only. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... "one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [94, 95, 96, 97, 97, 99], - ... } - ... ) - >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[u32] │ - ╞═══════╪═══════════╡ - │ one ┆ [0, 1, 2] │ - │ two ┆ [3, 4, 5] │ - └───────┴───────────┘ - - ''' - def count(self) -> Self: - ''' - Return the number of elements in the column. - - .. warning:: - Null values are treated like regular elements in this context. - - Examples - -------- - >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) - >>> df.select(pl.all().count()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 3 ┆ 3 │ - └─────┴─────┘ - - ''' - def len(self) -> Self: - ''' - Return the number of elements in the column. - - Null values are treated like regular elements in this context. - - Alias for :func:`count`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) - >>> df.select(pl.all().len()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 3 ┆ 3 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: - ''' - Get a slice of this expression. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. 
If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10, 11], - ... "b": [None, 4, 4, 4], - ... } - ... ) - >>> df.select(pl.all().slice(1, 2)) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 9 ┆ 4 │ - │ 10 ┆ 4 │ - └─────┴─────┘ - - ''' - def append(self, other: IntoExpr) -> Self: - ''' - Append expressions. - - This is done by adding the chunks of `other` to this `Series`. - - Parameters - ---------- - other - Expression to append. - upcast - Cast both `Series` to the same supertype. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10], - ... "b": [None, 4, 4], - ... } - ... ) - >>> df.select(pl.all().head(1).append(pl.all().tail(1))) - shape: (2, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 8 ┆ null │ - │ 10 ┆ 4 │ - └─────┴──────┘ - - ''' - def rechunk(self) -> Self: - ''' - Create a single chunk of memory for this Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - - Create a Series with 3 nulls, append column a then rechunk - - >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) - shape: (6, 1) - ┌────────┐ - │ repeat │ - │ --- │ - │ i64 │ - ╞════════╡ - │ null │ - │ null │ - │ null │ - │ 1 │ - │ 1 │ - │ 2 │ - └────────┘ - - ''' - def drop_nulls(self) -> Self: - ''' - Drop all null values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nans - - Notes - ----- - A null value is not the same as a NaN value. - To drop NaN values, use :func:`drop_nans`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) - >>> df.select(pl.col("a").drop_nulls()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 3.0 │ - │ NaN │ - └─────┘ - - ''' - def drop_nans(self) -> Self: - ''' - Drop all floating point NaN values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nulls - - Notes - ----- - A NaN value is not the same as a null value. - To drop null values, use :func:`drop_nulls`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) - >>> df.select(pl.col("a").drop_nans()) - shape: (3, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 1.0 │ - │ null │ - │ 3.0 │ - └──────┘ - - ''' - def cum_sum(self) -> Self: - ''' - Get an array with the cumulative sum computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_sum().alias("cum_sum"), - ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_sum ┆ cum_sum_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 10 │ - │ 2 ┆ 3 ┆ 9 │ - │ 3 ┆ 6 ┆ 7 │ - │ 4 ┆ 10 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - Null values are excluded, but can also be filled by calling `forward_fill`. - - >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) - >>> df.with_columns( - ... pl.col("values").cum_sum().alias("value_cum_sum"), - ... pl.col("values") - ... .cum_sum() - ... .forward_fill() - ... 
.alias("value_cum_sum_all_filled"), - ... ) - shape: (8, 3) - ┌────────┬───────────────┬──────────────────────────┐ - │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞════════╪═══════════════╪══════════════════════════╡ - │ null ┆ null ┆ null │ - │ 10 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 8 ┆ 18 ┆ 18 │ - │ 9 ┆ 27 ┆ 27 │ - │ null ┆ null ┆ 27 │ - │ 16 ┆ 43 ┆ 43 │ - │ null ┆ null ┆ 43 │ - └────────┴───────────────┴──────────────────────────┘ - - ''' - def cum_prod(self) -> Self: - ''' - Get an array with the cumulative product computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_prod().alias("cum_prod"), - ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), - ... ) - shape: (4, 3) - ┌─────┬──────────┬──────────────────┐ - │ a ┆ cum_prod ┆ cum_prod_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪══════════╪══════════════════╡ - │ 1 ┆ 1 ┆ 24 │ - │ 2 ┆ 2 ┆ 24 │ - │ 3 ┆ 6 ┆ 12 │ - │ 4 ┆ 24 ┆ 4 │ - └─────┴──────────┴──────────────────┘ - - ''' - def cum_min(self) -> Self: - ''' - Get an array with the cumulative min computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_min().alias("cum_min"), - ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_min ┆ cum_min_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 1 ┆ 2 │ - │ 3 ┆ 1 ┆ 3 │ - │ 4 ┆ 1 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - ''' - def cum_max(self) -> Self: - ''' - Get an array with the cumulative max computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_max().alias("cum_max"), - ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_max ┆ cum_max_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 4 │ - │ 2 ┆ 2 ┆ 4 │ - │ 3 ┆ 3 ┆ 4 │ - │ 4 ┆ 4 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - Null values are excluded, but can also be filled by calling `forward_fill`. - - >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) - >>> df.with_columns( - ... pl.col("values").cum_max().alias("cum_max"), - ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), - ... ) - shape: (8, 3) - ┌────────┬─────────┬────────────────────┐ - │ values ┆ cum_max ┆ cum_max_all_filled │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞════════╪═════════╪════════════════════╡ - │ null ┆ null ┆ null │ - │ 10 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 8 ┆ 10 ┆ 10 │ - │ 9 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 16 ┆ 16 ┆ 16 │ - │ null ┆ null ┆ 16 │ - └────────┴─────────┴────────────────────┘ - - ''' - def cum_count(self) -> Self: - ''' - Get an array with the cumulative count computed at every element. - - Counting from 0 to len - - Parameters - ---------- - reverse - Reverse the operation. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_count().alias("cum_count"), - ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), - ... ) - shape: (4, 3) - ┌─────┬───────────┬───────────────────┐ - │ a ┆ cum_count ┆ cum_count_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ u32 ┆ u32 │ - ╞═════╪═══════════╪═══════════════════╡ - │ 1 ┆ 0 ┆ 3 │ - │ 2 ┆ 1 ┆ 2 │ - │ 3 ┆ 2 ┆ 1 │ - │ 4 ┆ 3 ┆ 0 │ - └─────┴───────────┴───────────────────┘ - - ''' - def floor(self) -> Self: - ''' - Rounds down to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) - >>> df.select(pl.col("a").floor()) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - │ 0.0 │ - │ 1.0 │ - │ 1.0 │ - └─────┘ - - ''' - def ceil(self) -> Self: - ''' - Rounds up to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) - >>> df.select(pl.col("a").ceil()) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 1.0 │ - │ 1.0 │ - │ 2.0 │ - └─────┘ - - ''' - def round(self, decimals: int = ...) -> Self: - ''' - Round underlying floating point data by `decimals` digits. - - Parameters - ---------- - decimals - Number of decimals to round by. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) - >>> df.select(pl.col("a").round(1)) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.3 │ - │ 0.5 │ - │ 1.0 │ - │ 1.2 │ - └─────┘ - - ''' - def round_sig_figs(self, digits: int) -> Self: - ''' - Round to a number of significant figures. - - Parameters - ---------- - digits - Number of significant figures to round to. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) - >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) - shape: (3, 2) - ┌─────────┬────────────────┐ - │ a ┆ round_sig_figs │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════════╪════════════════╡ - │ 0.01234 ┆ 0.012 │ - │ 3.333 ┆ 3.3 │ - │ 1234.0 ┆ 1200.0 │ - └─────────┴────────────────┘ - - ''' - def dot(self, other: Expr | str) -> Self: - ''' - Compute the dot/inner product between two Expressions. - - Parameters - ---------- - other - Expression to compute dot product with. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> df.select(pl.col("a").dot(pl.col("b"))) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 44 │ - └─────┘ - - ''' - def mode(self) -> Self: - ''' - Compute the most occurring value(s). - - Can return multiple Values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 1, 2, 3], - ... "b": [1, 1, 2, 2], - ... } - ... ) - >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 1 │ - │ 1 ┆ 2 │ - └─────┴─────┘ - - ''' - def cast(self, dtype: PolarsDataType | type[Any]) -> Self: - ''' - Cast between data types. - - Parameters - ---------- - dtype - DataType to cast to. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["4", "5", "6"], - ... } - ... ) - >>> df.with_columns( - ... [ - ... pl.col("a").cast(pl.Float64), - ... 
pl.col("b").cast(pl.Int32), - ... ] - ... ) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ i32 │ - ╞═════╪═════╡ - │ 1.0 ┆ 4 │ - │ 2.0 ┆ 5 │ - │ 3.0 ┆ 6 │ - └─────┴─────┘ - - ''' - def sort(self) -> Self: - ''' - Sort this column. - - When used in a projection/selection context, the whole column is sorted. - When used in a group by context, the groups are sorted. - - Parameters - ---------- - descending - Sort in descending order. - nulls_last - Place null values last. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, None, 3, 2], - ... } - ... ) - >>> df.select(pl.col("a").sort()) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ null │ - │ 1 │ - │ 2 │ - │ 3 │ - └──────┘ - >>> df.select(pl.col("a").sort(descending=True)) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ null │ - │ 3 │ - │ 2 │ - │ 1 │ - └──────┘ - >>> df.select(pl.col("a").sort(nulls_last=True)) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ 1 │ - │ 2 │ - │ 3 │ - │ null │ - └──────┘ - - When sorting in a group by context, the groups are sorted. - - >>> df = pl.DataFrame( - ... { - ... "group": ["one", "one", "one", "two", "two", "two"], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌───────┬────────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪════════════╡ - │ two ┆ [3, 4, 99] │ - │ one ┆ [1, 2, 98] │ - └───────┴────────────┘ - - ''' - def top_k(self, k: int | IntoExprColumn = ...) -> Self: - ''' - Return the `k` largest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - bottom_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("value").top_k().alias("top_k"), - ... pl.col("value").bottom_k().alias("bottom_k"), - ... ] - ... ) - shape: (5, 2) - ┌───────┬──────────┐ - │ top_k ┆ bottom_k │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═══════╪══════════╡ - │ 99 ┆ 1 │ - │ 98 ┆ 2 │ - │ 4 ┆ 3 │ - │ 3 ┆ 4 │ - │ 2 ┆ 98 │ - └───────┴──────────┘ - - ''' - def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: - ''' - Return the `k` smallest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - top_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("value").top_k().alias("top_k"), - ... pl.col("value").bottom_k().alias("bottom_k"), - ... ] - ... ) - shape: (5, 2) - ┌───────┬──────────┐ - │ top_k ┆ bottom_k │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═══════╪══════════╡ - │ 99 ┆ 1 │ - │ 98 ┆ 2 │ - │ 4 ┆ 3 │ - │ 3 ┆ 4 │ - │ 2 ┆ 98 │ - └───────┴──────────┘ - - ''' - def arg_sort(self) -> Self: - ''' - Get the index values that would sort this column. - - Parameters - ---------- - descending - Sort in descending (descending) order. - nulls_last - Place null values last instead of first. - - Returns - ------- - Expr - Expression of data type :class:`UInt32`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... 
) - >>> df.select(pl.col("a").arg_sort()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - │ 0 │ - │ 2 │ - └─────┘ - - ''' - def arg_max(self) -> Self: - ''' - Get the index of the maximal value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... ) - >>> df.select(pl.col("a").arg_max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def arg_min(self) -> Self: - ''' - Get the index of the minimal value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... ) - >>> df.select(pl.col("a").arg_min()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - └─────┘ - - ''' - def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: - ''' - Find indices where elements should be inserted to maintain order. - - .. math:: a[i-1] < v <= a[i] - - Parameters - ---------- - element - Expression or scalar value. - side : {\'any\', \'left\', \'right\'} - If \'any\', the index of the first suitable location found is given. - If \'left\', the index of the leftmost suitable location found is given. - If \'right\', return the rightmost suitable location found is given. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "values": [1, 2, 3, 5], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("values").search_sorted(0).alias("zero"), - ... pl.col("values").search_sorted(3).alias("three"), - ... pl.col("values").search_sorted(6).alias("six"), - ... ] - ... ) - shape: (1, 3) - ┌──────┬───────┬─────┐ - │ zero ┆ three ┆ six │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞══════╪═══════╪═════╡ - │ 0 ┆ 2 ┆ 4 │ - └──────┴───────┴─────┘ - - ''' - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: - ''' - Sort this column by the ordering of other columns. - - When used in a projection/selection context, the whole column is sorted. - When used in a group by context, the groups are sorted. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. - *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "a", "b", "b"], - ... "value1": [1, 3, 4, 2], - ... "value2": [8, 7, 6, 5], - ... } - ... ) - >>> df.select(pl.col("group").sort_by("value1")) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ a │ - │ b │ - │ a │ - │ b │ - └───────┘ - - Sorting by expressions is also supported. - - >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ b │ - │ a │ - │ a │ - │ b │ - └───────┘ - - Sort by multiple columns by passing a list of columns. - - >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ b │ - │ a │ - │ b │ - │ a │ - └───────┘ - - Or use positional arguments to sort by multiple columns in the same way. 
- - >>> df.select(pl.col("group").sort_by("value1", "value2")) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ a │ - │ b │ - │ a │ - │ b │ - └───────┘ - - When sorting in a group by context, the groups are sorted. - - >>> df.group_by("group").agg( - ... pl.col("value1").sort_by("value2") - ... ) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value1 │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ a ┆ [3, 1] │ - │ b ┆ [2, 4] │ - └───────┴───────────┘ - - Take a single row from each group where a column attains its minimal value - within that group. - - >>> df.group_by("group").agg( - ... pl.all().sort_by("value2").first() - ... ) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌───────┬────────┬────────┐ - │ group ┆ value1 ┆ value2 | - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 | - ╞═══════╪════════╪════════╡ - │ a ┆ 3 ┆ 7 | - │ b ┆ 2 ┆ 5 | - └───────┴────────┴────────┘ - - ''' - def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: - ''' - Take values by index. - - Parameters - ---------- - indices - An expression that leads to a UInt32 dtyped Series. - - Returns - ------- - Expr - Expression of the same data type. - - See Also - -------- - Expr.get : Take a single value - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... "one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group", maintain_order=True).agg( - ... pl.col("value").gather([2, 1]) - ... ) - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ one ┆ [2, 98] │ - │ two ┆ [4, 99] │ - └───────┴───────────┘ - ''' - def get(self, index: int | Expr) -> Self: - ''' - Return a single value by index. - - Parameters - ---------- - index - An expression that leads to a UInt32 index. - - Returns - ------- - Expr - Expression of the same data type. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... "one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) - shape: (2, 2) - ┌───────┬───────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═══════╡ - │ one ┆ 98 │ - │ two ┆ 99 │ - └───────┴───────┘ - - ''' - def shift(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns(shift=pl.col("a").shift()) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ null │ - │ 2 ┆ 1 │ - │ 3 ┆ 2 │ - │ 4 ┆ 3 │ - └─────┴───────┘ - - Pass a negative value to shift in the opposite direction instead. 
- - >>> df.with_columns(shift=pl.col("a").shift(-2)) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - │ 3 ┆ null │ - │ 4 ┆ null │ - └─────┴───────┘ - - Specify `fill_value` to fill the resulting null values. - - >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - │ 3 ┆ 100 │ - │ 4 ┆ 100 │ - └─────┴───────┘ - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: - ''' - Fill null values using the specified value or strategy. - - To interpolate over null values see interpolate. - See the examples below to fill nulls with an expression. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... } - ... ) - >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 0 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(99)) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 99 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 4 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞══════╪═════╡ - │ 1 ┆ 4.0 │ - │ 2 ┆ 5.0 │ - │ null ┆ 6.0 │ - └──────┴─────┘ - >>> df.with_columns(pl.all().fill_null(pl.all().median())) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 1.0 ┆ 4.0 │ - │ 2.0 ┆ 5.0 │ - │ 1.5 ┆ 6.0 │ - └─────┴─────┘ - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Self: - ''' - Fill floating point NaN value with a fill value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, None, float("nan")], - ... "b": [4.0, float("nan"), 6], - ... } - ... ) - >>> df.with_columns(pl.col("b").fill_nan(0)) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪═════╡ - │ 1.0 ┆ 4.0 │ - │ null ┆ 0.0 │ - │ NaN ┆ 6.0 │ - └──────┴─────┘ - - ''' - def forward_fill(self, limit: int | None = ...) -> Self: - ''' - Fill missing values with the latest seen values. - - Parameters - ---------- - limit - The number of consecutive null values to forward fill. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... } - ... ) - >>> df.select(pl.all().forward_fill()) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 4 │ - │ 2 ┆ 6 │ - └─────┴─────┘ - - ''' - def backward_fill(self, limit: int | None = ...) -> Self: - ''' - Fill missing values with the next to be seen values. 
- - Parameters - ---------- - limit - The number of consecutive null values to backward fill. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... "c": [None, None, 2], - ... } - ... ) - >>> df.select(pl.all().backward_fill()) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 4 ┆ 2 │ - │ 2 ┆ 6 ┆ 2 │ - │ null ┆ 6 ┆ 2 │ - └──────┴─────┴─────┘ - >>> df.select(pl.all().backward_fill(limit=1)) - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ 1 ┆ 4 ┆ null │ - │ 2 ┆ 6 ┆ 2 │ - │ null ┆ 6 ┆ 2 │ - └──────┴─────┴──────┘ - - ''' - def reverse(self) -> Self: - ''' - Reverse the selection. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4, 5], - ... "fruits": ["banana", "banana", "apple", "apple", "banana"], - ... "B": [5, 4, 3, 2, 1], - ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], - ... } - ... ) - >>> df.select( - ... [ - ... pl.all(), - ... pl.all().reverse().name.suffix("_reverse"), - ... ] - ... ) - shape: (5, 8) - ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ - │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ - │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ - │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ - │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ - │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ - │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ - └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ - - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Get standard deviation. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").std()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Get variance. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").var()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def max(self) -> Self: - ''' - Get maximum value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) - >>> df.select(pl.col("a").max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def min(self) -> Self: - ''' - Get minimum value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) - >>> df.select(pl.col("a").min()) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ -1.0 │ - └──────┘ - - ''' - def nan_max(self) -> Self: - ''' - Get maximum value, but propagate/poison encountered NaN values. 
- - This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0, float("nan")]}) - >>> df.select(pl.col("a").nan_max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ NaN │ - └─────┘ - - ''' - def nan_min(self) -> Self: - ''' - Get minimum value, but propagate/poison encountered NaN values. - - This differs from numpy\'s `nanmin` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0, float("nan")]}) - >>> df.select(pl.col("a").nan_min()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ NaN │ - └─────┘ - - ''' - def sum(self) -> Self: - ''' - Get sum value. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").sum()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 0 │ - └─────┘ - - ''' - def mean(self) -> Self: - ''' - Get mean value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").mean()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def median(self) -> Self: - ''' - Get median value using linear interpolation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").median()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def product(self) -> Self: - ''' - Compute the product of an expression. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").product()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def n_unique(self) -> Self: - ''' - Count unique values. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").n_unique()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def approx_n_unique(self) -> Self: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").approx_n_unique()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def null_count(self) -> Self: - ''' - Count null values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 1, None], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.select(pl.all().null_count()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 2 ┆ 0 │ - └─────┴─────┘ - - ''' - def arg_unique(self) -> Self: - ''' - Get index of first unique value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10], - ... "b": [None, 4, 4], - ... } - ... ) - >>> df.select(pl.col("a").arg_unique()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - │ 2 │ - └─────┘ - >>> df.select(pl.col("b").arg_unique()) - shape: (2, 1) - ┌─────┐ - │ b │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - └─────┘ - - ''' - def unique(self) -> Self: - ''' - Get unique values of this expression. - - Parameters - ---------- - maintain_order - Maintain order of data. This requires more work. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - │ 1 │ - └─────┘ - >>> df.select(pl.col("a").unique(maintain_order=True)) - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - └─────┘ - - ''' - def first(self) -> Self: - ''' - Get the first value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").first()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - └─────┘ - - ''' - def last(self) -> Self: - ''' - Get the last value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").last()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: - ''' - Compute expressions over the given groups. - - This expression is similar to performing a group by aggregation and joining the - result back into the original DataFrame. - - The outcome is similar to how `window functions - `_ - work in PostgreSQL. - - Parameters - ---------- - expr - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_exprs - Additional columns to group by, specified as positional arguments. - mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} - - group_to_rows - If the aggregation results in multiple values, assign them back to their - position in the DataFrame. This can only be done if the group yields - the same elements before aggregation as after. - - join - Join the groups as \'List\' to the row positions. - warning: this can be memory intensive. - - explode - Don\'t do any mapping, but simply flatten the group. - This only makes sense if the input data is sorted. - - Examples - -------- - Pass the name of a column to compute the expression over that column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "a", "b", "b", "b"], - ... "b": [1, 2, 3, 5, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... ) - >>> df.with_columns( - ... pl.col("c").max().over("a").name.suffix("_max"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_max │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 5 │ - │ b ┆ 3 ┆ 3 ┆ 3 │ - │ b ┆ 5 ┆ 2 ┆ 3 │ - │ b ┆ 3 ┆ 1 ┆ 3 │ - └─────┴─────┴─────┴───────┘ - - Expression input is supported. - - >>> df.with_columns( - ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_max │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 4 │ - │ b ┆ 5 ┆ 2 ┆ 2 │ - │ b ┆ 3 ┆ 1 ┆ 4 │ - └─────┴─────┴─────┴───────┘ - - Group by multiple columns by passing a list of column names or expressions. - - >>> df.with_columns( - ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_min │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 1 │ - │ b ┆ 5 ┆ 2 ┆ 2 │ - │ b ┆ 3 ┆ 1 ┆ 1 │ - └─────┴─────┴─────┴───────┘ - - Or use positional arguments to group by multiple columns in the same way. - - >>> df.with_columns( - ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_min │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 1 │ - │ b ┆ 5 ┆ 2 ┆ 1 │ - │ b ┆ 3 ┆ 1 ┆ 1 │ - └─────┴─────┴─────┴───────┘ - - ''' - def rolling(self, index_column: str) -> Self: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - If you have a time series ``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... - * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... - * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order. - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> df.with_columns( - ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), - ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), - ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), - ... 
) - shape: (6, 5) - ┌─────────────────────┬─────┬───────┬───────┬───────┐ - │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴─────┴───────┴───────┴───────┘ - - ''' - def is_unique(self) -> Self: - ''' - Get mask of unique values. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").is_unique()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ false │ - │ true │ - └───────┘ - - ''' - def is_first_distinct(self) -> Self: - ''' - Return a boolean mask indicating the first occurrence of each distinct value. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) - >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) - shape: (5, 2) - ┌─────┬───────┐ - │ a ┆ first │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪═══════╡ - │ 1 ┆ true │ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 2 ┆ false │ - └─────┴───────┘ - - ''' - def is_last_distinct(self) -> Self: - ''' - Return a boolean mask indicating the last occurrence of each distinct value. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) - >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) - shape: (5, 2) - ┌─────┬───────┐ - │ a ┆ last │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪═══════╡ - │ 1 ┆ false │ - │ 1 ┆ true │ - │ 2 ┆ false │ - │ 3 ┆ true │ - │ 2 ┆ true │ - └─────┴───────┘ - - ''' - def is_duplicated(self) -> Self: - ''' - Return a boolean mask indicating duplicated values. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").is_duplicated()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ true │ - │ true │ - │ false │ - └───────┘ - - ''' - def peak_max(self) -> Self: - ''' - Get a boolean mask of the local maximum peaks. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) - >>> df.select(pl.col("a").peak_max()) - shape: (5, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ false │ - │ false │ - │ false │ - │ true │ - └───────┘ - - ''' - def peak_min(self) -> Self: - ''' - Get a boolean mask of the local minimum peaks. - - Examples - -------- - >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) - >>> df.select(pl.col("a").peak_min()) - shape: (5, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ false │ - │ true │ - │ false │ - └───────┘ - - ''' - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Get quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) - >>> df.select(pl.col("a").quantile(0.3)) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 2.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.5 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.5 │ - └─────┘ - - ''' - def cut(self, breaks: Sequence[float]) -> Self: - ''' - Bin continuous values into discrete categories. - - Parameters - ---------- - breaks - List of unique cut points. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - left_closed - Set the intervals to be left-closed instead of right-closed. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - - Returns - ------- - Expr - Expression of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise an expression of data type :class:`Struct`. - - See Also - -------- - qcut - - Examples - -------- - Divide a column into three categories. - - >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) - >>> df.with_columns( - ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") - ... ) - shape: (5, 2) - ┌─────┬─────┐ - │ foo ┆ cut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪═════╡ - │ -2 ┆ a │ - │ -1 ┆ a │ - │ 0 ┆ b │ - │ 1 ┆ b │ - │ 2 ┆ c │ - └─────┴─────┘ - - Add both the category and the breakpoint. - - >>> df.with_columns( - ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") - ... ).unnest("cut") - shape: (5, 3) - ┌─────┬──────┬────────────┐ - │ foo ┆ brk ┆ foo_bin │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪══════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴──────┴────────────┘ - - ''' - def qcut(self, quantiles: Sequence[float] | int) -> Self: - ''' - Bin continuous values into discrete categories based on their quantiles. - - Parameters - ---------- - quantiles - Either a list of quantile probabilities between 0 and 1 or a positive - integer determining the number of bins with uniform probability. - labels - Names of the categories. The number of labels must be equal to the number - of categories. - left_closed - Set the intervals to be left-closed instead of right-closed. - allow_duplicates - If set to `True`, duplicates in the resulting quantiles are dropped, - rather than raising a `DuplicateError`. This can happen even with unique - probabilities, depending on the data. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - - Returns - ------- - Expr - Expression of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise an expression of data type :class:`Struct`. 
- - See Also - -------- - cut - - Examples - -------- - Divide a column into three categories according to pre-defined quantile - probabilities. - - >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) - >>> df.with_columns( - ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ foo ┆ qcut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪══════╡ - │ -2 ┆ a │ - │ -1 ┆ a │ - │ 0 ┆ b │ - │ 1 ┆ b │ - │ 2 ┆ c │ - └─────┴──────┘ - - Divide a column into two categories using uniform quantile probabilities. - - >>> df.with_columns( - ... pl.col("foo") - ... .qcut(2, labels=["low", "high"], left_closed=True) - ... .alias("qcut") - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ foo ┆ qcut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪══════╡ - │ -2 ┆ low │ - │ -1 ┆ low │ - │ 0 ┆ high │ - │ 1 ┆ high │ - │ 2 ┆ high │ - └─────┴──────┘ - - Add both the category and the breakpoint. - - >>> df.with_columns( - ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") - ... ).unnest("qcut") - shape: (5, 3) - ┌─────┬──────┬────────────┐ - │ foo ┆ brk ┆ foo_bin │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪══════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴──────┴────────────┘ - - ''' - def rle(self) -> Self: - ''' - Get the lengths of runs of identical values. - - Returns - ------- - Expr - Expression of data type :class:`Struct` with Fields "lengths" and "values". - - Examples - -------- - >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) - >>> df.select(pl.col("s").rle()).unnest("s") - shape: (6, 2) - ┌─────────┬────────┐ - │ lengths ┆ values │ - │ --- ┆ --- │ - │ i32 ┆ i64 │ - ╞═════════╪════════╡ - │ 2 ┆ 1 │ - │ 1 ┆ 2 │ - │ 1 ┆ 1 │ - │ 1 ┆ null │ - │ 1 ┆ 1 │ - │ 2 ┆ 3 │ - └─────────┴────────┘ - ''' - def rle_id(self) -> Self: - ''' - Map values to run IDs. - - Similar to RLE, but it maps each value to an ID corresponding to the run into - which it falls. This is especially useful when you want to define groups by - runs of identical values rather than the values themselves. - - - Examples - -------- - >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) - >>> # It works on structs of multiple values too! - >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) - shape: (5, 4) - ┌─────┬──────┬─────┬──────┐ - │ a ┆ b ┆ a_r ┆ ab_r │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ u32 ┆ u32 │ - ╞═════╪══════╪═════╪══════╡ - │ 1 ┆ x ┆ 0 ┆ 0 │ - │ 2 ┆ x ┆ 1 ┆ 1 │ - │ 1 ┆ null ┆ 2 ┆ 2 │ - │ 1 ┆ y ┆ 2 ┆ 3 │ - │ 1 ┆ y ┆ 2 ┆ 3 │ - └─────┴──────┴─────┴──────┘ - ''' - def filter(self, predicate: Expr) -> Self: - ''' - Filter a single column. - - The original order of the remaining elements is preserved. - - Mostly useful in an aggregation context. If you want to filter on a DataFrame - level, use `LazyFrame.filter`. - - Parameters - ---------- - predicate - Boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group_col": ["g1", "g1", "g2"], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.group_by("group_col").agg( - ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), - ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), - ... 
).sort("group_col") - shape: (2, 3) - ┌───────────┬─────┬─────┐ - │ group_col ┆ lt ┆ gte │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═══════════╪═════╪═════╡ - │ g1 ┆ 1 ┆ 2 │ - │ g2 ┆ 0 ┆ 3 │ - └───────────┴─────┴─────┘ - - ''' - def where(self, predicate: Expr) -> Self: - ''' - Filter a single column. - - Alias for :func:`filter`. - - Parameters - ---------- - predicate - Boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group_col": ["g1", "g1", "g2"], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.group_by("group_col").agg( - ... [ - ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), - ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), - ... ] - ... ).sort("group_col") - shape: (2, 3) - ┌───────────┬─────┬─────┐ - │ group_col ┆ lt ┆ gte │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═══════════╪═════╪═════╡ - │ g1 ┆ 1 ┆ 2 │ - │ g2 ┆ 0 ┆ 3 │ - └───────────┴─────┴─────┘ - - ''' - def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Apply a custom python function to a whole Series or sequence of Series. - - The output of this custom function must be a Series. If you want to apply a - custom function elementwise over single values, see :func:`map_elements`. - A reasonable use case for `map` functions is transforming the values - represented by an expression using a third-party library. - - Read more in `the book - `_. - - Parameters - ---------- - function - Lambda/function to apply. - return_dtype - Dtype of the output Series. - agg_list - Aggregate list. - - Notes - ----- - If you are looking to map a function over a window function or group_by context, - refer to func:`map_elements` instead. - - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - See Also - -------- - map_elements - replace - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "sine": [0.0, 1.0, 0.0, -1.0], - ... "cosine": [1.0, 0.0, -1.0, 0.0], - ... } - ... ) - >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) - shape: (1, 2) - ┌──────┬────────┐ - │ sine ┆ cosine │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪════════╡ - │ 1 ┆ 0 │ - └──────┴────────┘ - - ''' - def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Map a custom/user-defined function (UDF) to each element of a column. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - The UDF is applied to each element of a column. Note that, in a GroupBy - context, the column will have been pre-aggregated and so each element - will itself be a Series. Therefore, depending on the context, - requirements for `function` differ: - - * Selection - Expects `function` to be of type `Callable[[Any], Any]`. - Applies a Python function to each individual value in the column. - * GroupBy - Expects `function` to be of type `Callable[[Series], Any]`. - For each group, applies a Python function to the slice of the column - corresponding to that group. - - Parameters - ---------- - function - Lambda/function to map. - return_dtype - Dtype of the output Series. - If not set, the dtype will be `pl.Unknown`. - skip_nulls - Don\'t map the function over values that contain nulls (this is faster). 
- pass_name - Pass the Series name to the custom function (this is more expensive). - strategy : {\'thread_local\', \'threading\'} - This functionality is considered experimental and may be removed/changed. - - - \'thread_local\': run the python function on a single thread. - - \'threading\': run the python function on separate threads. Use with - care as this can slow performance. This might only speed up - your code if the amount of work per element is significant - and the python function releases the GIL (e.g. via calling - a c function) - - Notes - ----- - * Using `map_elements` is strongly discouraged as you will be effectively - running python "for" loops, which will be very slow. Wherever possible you - should prefer the native expression API to achieve the best performance. - - * If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. - - * Window function application using `over` is considered a GroupBy context - here, so `map_elements` can be used to map functions over window groups. - - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["a", "b", "c", "c"], - ... } - ... ) - - The function is applied to each element of column `\'a\'`: - - >>> df.with_columns( # doctest: +SKIP - ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────────┐ - │ a ┆ b ┆ a_times_2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 │ - ╞═════╪═════╪═══════════╡ - │ 1 ┆ a ┆ 2 │ - │ 2 ┆ b ┆ 4 │ - │ 3 ┆ c ┆ 6 │ - │ 1 ┆ c ┆ 2 │ - └─────┴─────┴───────────┘ - - Tip: it is better to implement this with an expression: - - >>> df.with_columns( - ... (pl.col("a") * 2).alias("a_times_2"), - ... ) # doctest: +IGNORE_RESULT - - In a GroupBy context, each element of the column is itself a Series: - - >>> ( - ... df.lazy().group_by("b").agg(pl.col("a")).collect() - ... ) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬───────────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [1] │ - │ b ┆ [2] │ - │ c ┆ [3, 1] │ - └─────┴───────────┘ - - Therefore, from the user\'s point-of-view, the function is applied per-group: - - >>> ( - ... df.lazy() - ... .group_by("b") - ... .agg(pl.col("a").map_elements(lambda x: x.sum())) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ b ┆ 2 │ - │ c ┆ 4 │ - └─────┴─────┘ - - Tip: again, it is better to implement this with an expression: - - >>> ( - ... df.lazy() - ... .group_by("b", maintain_order=True) - ... .agg(pl.col("a").sum()) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - - Window function application using `over` will behave as a GroupBy - context, with your function receiving individual window groups: - - >>> df = pl.DataFrame( - ... { - ... "key": ["x", "x", "y", "x", "y", "z"], - ... "val": [1, 1, 1, 1, 1, 1], - ... } - ... ) - >>> df.with_columns( - ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), - ... 
).sort("key") - shape: (6, 3) - ┌─────┬─────┬────────┐ - │ key ┆ val ┆ scaled │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪════════╡ - │ x ┆ 1 ┆ 3 │ - │ x ┆ 1 ┆ 3 │ - │ x ┆ 1 ┆ 3 │ - │ y ┆ 1 ┆ 2 │ - │ y ┆ 1 ┆ 2 │ - │ z ┆ 1 ┆ 1 │ - └─────┴─────┴────────┘ - - Note that this function would *also* be better-implemented natively: - - >>> df.with_columns( - ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), - ... ).sort( - ... "key" - ... ) # doctest: +IGNORE_RESULT - - ''' - def flatten(self) -> Self: - ''' - Flatten a list or string column. - - Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "b", "b"], - ... "values": [[1, 2], [2, 3], [4]], - ... } - ... ) - >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ values │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ a ┆ [1, 2] │ - │ b ┆ [2, 3, 4] │ - └───────┴───────────┘ - - ''' - def explode(self) -> Self: - ''' - Explode a list expression. - - This means that every item is expanded to a new row. - - Returns - ------- - Expr - Expression with the data type of the list elements. - - See Also - -------- - Expr.list.explode : Explode a list column. - Expr.str.explode : Explode a string column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "b"], - ... "values": [ - ... [1, 2], - ... [3, 4], - ... ], - ... } - ... ) - >>> df.select(pl.col("values").explode()) - shape: (4, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 1 │ - │ 2 │ - │ 3 │ - │ 4 │ - └────────┘ - - ''' - def implode(self) -> Self: - ''' - Aggregate values into a list. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [4, 5, 6], - ... } - ... ) - >>> df.select(pl.all().implode()) - shape: (1, 2) - ┌───────────┬───────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ list[i64] ┆ list[i64] │ - ╞═══════════╪═══════════╡ - │ [1, 2, 3] ┆ [4, 5, 6] │ - └───────────┴───────────┘ - - ''' - def gather_every(self, n: int) -> Self: - ''' - Take every nth value in the Series and return as a new Series. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) - >>> df.select(pl.col("foo").gather_every(3)) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 4 │ - │ 7 │ - └─────┘ - - ''' - def head(self, n: int | Expr = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.head(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def tail(self, n: int | Expr = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.tail(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 5 │ - │ 6 │ - │ 7 │ - └─────┘ - - ''' - def limit(self, n: int | Expr = ...) -> Self: - ''' - Get the first `n` rows (alias for :func:`Expr.head`). - - Parameters - ---------- - n - Number of rows to return. 
- - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.limit(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def and_(self, *others: Any) -> Self: - ''' - Method equivalent of bitwise "and" operator `expr & other & ...`. - - Parameters - ---------- - *others - One or more integer or boolean expressions to evaluate/combine. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5, 6, 7, 4, 8], - ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], - ... "z": [-9, 2, -1, 4, 8], - ... } - ... ) - >>> df.select( - ... (pl.col("x") >= pl.col("z")) - ... .and_( - ... pl.col("y") >= pl.col("z"), - ... pl.col("y") == pl.col("y"), - ... pl.col("z") <= pl.col("x"), - ... pl.col("y") != pl.col("x"), - ... ) - ... .alias("all") - ... ) - shape: (5, 1) - ┌───────┐ - │ all │ - │ --- │ - │ bool │ - ╞═══════╡ - │ true │ - │ true │ - │ true │ - │ false │ - │ false │ - └───────┘ - - ''' - def or_(self, *others: Any) -> Self: - ''' - Method equivalent of bitwise "or" operator `expr | other | ...`. - - Parameters - ---------- - *others - One or more integer or boolean expressions to evaluate/combine. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5, 6, 7, 4, 8], - ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], - ... "z": [-9, 2, -1, 4, 8], - ... } - ... ) - >>> df.select( - ... (pl.col("x") == pl.col("y")) - ... .or_( - ... pl.col("x") == pl.col("y"), - ... pl.col("y") == pl.col("z"), - ... pl.col("y").cast(int) == pl.col("z"), - ... ) - ... .alias("any") - ... ) - shape: (5, 1) - ┌───────┐ - │ any │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ false │ - │ true │ - │ false │ - └───────┘ - - ''' - def eq(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr == other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0], - ... "y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").eq(pl.col("y")).alias("x == y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x == y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 1.0 ┆ 2.0 ┆ false │ - │ 2.0 ┆ 2.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 4.0 ┆ 4.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def eq_missing(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr == other` where `None == None`. - - This differs from default `eq` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], - ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").eq(pl.col("y")).alias("x eq y"), - ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), - ... 
) - shape: (6, 4) - ┌──────┬──────┬────────┬────────────────┐ - │ x ┆ y ┆ x eq y ┆ x eq_missing y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪══════╪════════╪════════════════╡ - │ 1.0 ┆ 2.0 ┆ false ┆ false │ - │ 2.0 ┆ 2.0 ┆ true ┆ true │ - │ NaN ┆ NaN ┆ false ┆ false │ - │ 4.0 ┆ 4.0 ┆ true ┆ true │ - │ null ┆ 5.0 ┆ null ┆ false │ - │ null ┆ null ┆ null ┆ true │ - └──────┴──────┴────────┴────────────────┘ - - ''' - def ge(self, other: Any) -> Self: - ''' - Method equivalent of "greater than or equal" operator `expr >= other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 2.0], - ... "y": [5.0, 3.0, float("nan"), 1.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ge(pl.col("y")).alias("x >= y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x >= y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 5.0 ┆ 5.0 ┆ true │ - │ 4.0 ┆ 3.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 2.0 ┆ 1.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def gt(self, other: Any) -> Self: - ''' - Method equivalent of "greater than" operator `expr > other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 2.0], - ... "y": [5.0, 3.0, float("nan"), 1.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").gt(pl.col("y")).alias("x > y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────┐ - │ x ┆ y ┆ x > y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪═══════╡ - │ 5.0 ┆ 5.0 ┆ false │ - │ 4.0 ┆ 3.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 2.0 ┆ 1.0 ┆ true │ - └─────┴─────┴───────┘ - - ''' - def le(self, other: Any) -> Self: - ''' - Method equivalent of "less than or equal" operator `expr <= other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 0.5], - ... "y": [5.0, 3.5, float("nan"), 2.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").le(pl.col("y")).alias("x <= y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x <= y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 5.0 ┆ 5.0 ┆ true │ - │ 4.0 ┆ 3.5 ┆ false │ - │ NaN ┆ NaN ┆ false │ - │ 0.5 ┆ 2.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def lt(self, other: Any) -> Self: - ''' - Method equivalent of "less than" operator `expr < other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 3.0], - ... "y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").lt(pl.col("y")).alias("x < y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────┐ - │ x ┆ y ┆ x < y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪═══════╡ - │ 1.0 ┆ 2.0 ┆ true │ - │ 2.0 ┆ 2.0 ┆ false │ - │ NaN ┆ NaN ┆ false │ - │ 3.0 ┆ 4.0 ┆ true │ - └─────┴─────┴───────┘ - - ''' - def ne(self, other: Any) -> Self: - ''' - Method equivalent of inequality operator `expr != other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0], - ... 
"y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ne(pl.col("y")).alias("x != y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x != y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 1.0 ┆ 2.0 ┆ true │ - │ 2.0 ┆ 2.0 ┆ false │ - │ NaN ┆ NaN ┆ true │ - │ 4.0 ┆ 4.0 ┆ false │ - └─────┴─────┴────────┘ - - ''' - def ne_missing(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr != other` where `None == None`. - - This differs from default `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], - ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ne(pl.col("y")).alias("x ne y"), - ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), - ... ) - shape: (6, 4) - ┌──────┬──────┬────────┬────────────────┐ - │ x ┆ y ┆ x ne y ┆ x ne_missing y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪══════╪════════╪════════════════╡ - │ 1.0 ┆ 2.0 ┆ true ┆ true │ - │ 2.0 ┆ 2.0 ┆ false ┆ false │ - │ NaN ┆ NaN ┆ true ┆ true │ - │ 4.0 ┆ 4.0 ┆ false ┆ false │ - │ null ┆ 5.0 ┆ null ┆ true │ - │ null ┆ null ┆ null ┆ false │ - └──────┴──────┴────────┴────────────────┘ - - ''' - def add(self, other: Any) -> Self: - ''' - Method equivalent of addition operator `expr + other`. - - Parameters - ---------- - other - numeric or string value; accepts expression input. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) - >>> df.with_columns( - ... pl.col("x").add(2).alias("x+int"), - ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), - ... ) - shape: (5, 3) - ┌─────┬───────┬────────┐ - │ x ┆ x+int ┆ x+expr │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═══════╪════════╡ - │ 1 ┆ 3 ┆ 2 │ - │ 2 ┆ 4 ┆ 4 │ - │ 3 ┆ 5 ┆ 9 │ - │ 4 ┆ 6 ┆ 28 │ - │ 5 ┆ 7 ┆ 125 │ - └─────┴───────┴────────┘ - - >>> df = pl.DataFrame( - ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} - ... ) - >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) - shape: (3, 4) - ┌─────┬─────┬─────┬─────┐ - │ x ┆ y ┆ z ┆ xyz │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ str ┆ str │ - ╞═════╪═════╪═════╪═════╡ - │ a ┆ b ┆ c ┆ abc │ - │ d ┆ e ┆ f ┆ def │ - │ g ┆ h ┆ i ┆ ghi │ - └─────┴─────┴─────┴─────┘ - - ''' - def floordiv(self, other: Any) -> Self: - ''' - Method equivalent of integer division operator `expr // other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - See Also - -------- - truediv - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) - >>> df.with_columns( - ... pl.col("x").truediv(2).alias("x/2"), - ... pl.col("x").floordiv(2).alias("x//2"), - ... ) - shape: (5, 3) - ┌─────┬─────┬──────┐ - │ x ┆ x/2 ┆ x//2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ i64 │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 0.5 ┆ 0 │ - │ 2 ┆ 1.0 ┆ 1 │ - │ 3 ┆ 1.5 ┆ 1 │ - │ 4 ┆ 2.0 ┆ 2 │ - │ 5 ┆ 2.5 ┆ 2 │ - └─────┴─────┴──────┘ - - ''' - def mod(self, other: Any) -> Self: - ''' - Method equivalent of modulus operator `expr % other`. - - Parameters - ---------- - other - Numeric literal or expression value. 
- - Examples - -------- - >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) - >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) - shape: (5, 2) - ┌─────┬─────┐ - │ x ┆ x%2 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 0 ┆ 0 │ - │ 1 ┆ 1 │ - │ 2 ┆ 0 │ - │ 3 ┆ 1 │ - │ 4 ┆ 0 │ - └─────┴─────┘ - - ''' - def mul(self, other: Any) -> Self: - ''' - Method equivalent of multiplication operator `expr * other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) - >>> df.with_columns( - ... pl.col("x").mul(2).alias("x*2"), - ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), - ... ) - shape: (5, 3) - ┌─────┬─────┬───────────┐ - │ x ┆ x*2 ┆ x * xlog2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ f64 │ - ╞═════╪═════╪═══════════╡ - │ 1 ┆ 2 ┆ 0.0 │ - │ 2 ┆ 4 ┆ 2.0 │ - │ 4 ┆ 8 ┆ 8.0 │ - │ 8 ┆ 16 ┆ 24.0 │ - │ 16 ┆ 32 ┆ 64.0 │ - └─────┴─────┴───────────┘ - - ''' - def sub(self, other: Any) -> Self: - ''' - Method equivalent of subtraction operator `expr - other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("x").sub(2).alias("x-2"), - ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), - ... ) - shape: (5, 3) - ┌─────┬─────┬────────┐ - │ x ┆ x-2 ┆ x-expr │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪════════╡ - │ 0 ┆ -2 ┆ 0 │ - │ 1 ┆ -1 ┆ 0 │ - │ 2 ┆ 0 ┆ -1 │ - │ 3 ┆ 1 ┆ -3 │ - │ 4 ┆ 2 ┆ -6 │ - └─────┴─────┴────────┘ - - ''' - def truediv(self, other: Any) -> Self: - ''' - Method equivalent of float division operator `expr / other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Notes - ----- - Zero-division behaviour follows IEEE-754: - - 0/0: Invalid operation - mathematically undefined, returns NaN. - n/0: On finite operands gives an exact infinite result, eg: ±infinity. - - See Also - -------- - floordiv - - Examples - -------- - >>> df = pl.DataFrame( - ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} - ... ) - >>> df.with_columns( - ... pl.col("x").truediv(2).alias("x/2"), - ... pl.col("x").truediv(pl.col("y")).alias("x/y"), - ... ) - shape: (5, 4) - ┌─────┬──────┬──────┬───────┐ - │ x ┆ y ┆ x/2 ┆ x/y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ f64 ┆ f64 │ - ╞═════╪══════╪══════╪═══════╡ - │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ - │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ - │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ - │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ - │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ - └─────┴──────┴──────┴───────┘ - - ''' - def pow(self, exponent: int | float | None | Series | Expr) -> Self: - ''' - Method equivalent of exponentiation operator `expr ** exponent`. - - Parameters - ---------- - exponent - Numeric literal or expression exponent value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) - >>> df.with_columns( - ... pl.col("x").pow(3).alias("cube"), - ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), - ... ) - shape: (4, 3) - ┌─────┬───────┬────────────┐ - │ x ┆ cube ┆ x ** xlog2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ f64 │ - ╞═════╪═══════╪════════════╡ - │ 1 ┆ 1.0 ┆ 1.0 │ - │ 2 ┆ 8.0 ┆ 2.0 │ - │ 4 ┆ 64.0 ┆ 16.0 │ - │ 8 ┆ 512.0 ┆ 512.0 │ - └─────┴───────┴────────────┘ - - ''' - def xor(self, other: Any) -> Self: - ''' - Method equivalent of bitwise exclusive-or operator `expr ^ other`. - - Parameters - ---------- - other - Integer or boolean value; accepts expression input. 
- - Examples - -------- - >>> df = pl.DataFrame( - ... {"x": [True, False, True, False], "y": [True, True, False, False]} - ... ) - >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) - shape: (4, 3) - ┌───────┬───────┬───────┐ - │ x ┆ y ┆ x ^ y │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞═══════╪═══════╪═══════╡ - │ true ┆ true ┆ false │ - │ false ┆ true ┆ true │ - │ true ┆ false ┆ true │ - │ false ┆ false ┆ false │ - └───────┴───────┴───────┘ - - >>> def binary_string(n: int) -> str: - ... return bin(n)[2:].zfill(8) - >>> - >>> df = pl.DataFrame( - ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, - ... schema={"x": pl.UInt8, "y": pl.UInt8}, - ... ) - >>> df.with_columns( - ... pl.col("x").map_elements(binary_string).alias("bin_x"), - ... pl.col("y").map_elements(binary_string).alias("bin_y"), - ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), - ... pl.col("x") - ... .xor(pl.col("y")) - ... .map_elements(binary_string) - ... .alias("bin_xor_xy"), - ... ) - shape: (4, 6) - ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ - │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ - ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ - │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ - │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ - │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ - │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ - └─────┴─────┴──────────┴──────────┴────────┴────────────┘ - - ''' - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: - ''' - Check if elements of this expression are present in the other Series. - - Parameters - ---------- - other - Series or sequence of primitive type. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} - ... ) - >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) - shape: (3, 3) - ┌───────────┬──────────────────┬──────────┐ - │ sets ┆ optional_members ┆ contains │ - │ --- ┆ --- ┆ --- │ - │ list[i64] ┆ i64 ┆ bool │ - ╞═══════════╪══════════════════╪══════════╡ - │ [1, 2, 3] ┆ 1 ┆ true │ - │ [1, 2] ┆ 2 ┆ true │ - │ [9, 10] ┆ 3 ┆ false │ - └───────────┴──────────────────┴──────────┘ - - ''' - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: - ''' - Repeat the elements in this Series as specified in the given expression. - - The repeated elements are expanded into a `List`. - - Parameters - ---------- - by - Numeric column that determines how often the values will be repeated. - The column will be coerced to UInt32. Give this dtype to make the coercion a - no-op. - - Returns - ------- - Expr - Expression of data type :class:`List`, where the inner data type is equal - to the original data type. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["x", "y", "z"], - ... "n": [1, 2, 3], - ... } - ... ) - >>> df.select(pl.col("a").repeat_by("n")) - shape: (3, 1) - ┌─────────────────┐ - │ a │ - │ --- │ - │ list[str] │ - ╞═════════════════╡ - │ ["x"] │ - │ ["y", "y"] │ - │ ["z", "z", "z"] │ - └─────────────────┘ - - ''' - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: - ''' - Check if this expression is between the given start and end values. - - Parameters - ---------- - lower_bound - Lower bound value. Accepts expression input. 
Strings are parsed as column - names, other non-expression inputs are parsed as literals. - upper_bound - Upper bound value. Accepts expression input. Strings are parsed as column - names, other non-expression inputs are parsed as literals. - closed : {\'both\', \'left\', \'right\', \'none\'} - Define which sides of the interval are closed (inclusive). - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) - >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) - shape: (5, 2) - ┌─────┬────────────┐ - │ num ┆ is_between │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪════════════╡ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 4 ┆ true │ - │ 5 ┆ false │ - └─────┴────────────┘ - - Use the `closed` argument to include or exclude the values at the bounds: - - >>> df.with_columns( - ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") - ... ) - shape: (5, 2) - ┌─────┬────────────┐ - │ num ┆ is_between │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪════════════╡ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 4 ┆ false │ - │ 5 ┆ false │ - └─────┴────────────┘ - - You can also use strings as well as numeric/temporal values (note: ensure that - string literals are wrapped with `lit` so as not to conflate them with - column names): - - >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) - >>> df.with_columns( - ... pl.col("a") - ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") - ... .alias("is_between") - ... ) - shape: (5, 2) - ┌─────┬────────────┐ - │ a ┆ is_between │ - │ --- ┆ --- │ - │ str ┆ bool │ - ╞═════╪════════════╡ - │ a ┆ true │ - │ b ┆ true │ - │ c ┆ true │ - │ d ┆ false │ - │ e ┆ false │ - └─────┴────────────┘ - - ''' - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: - ''' - Hash the elements in the selection. - - The hash value is of type `UInt64`. - - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. - - Notes - ----- - This implementation of :func:`rows` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": ["x", None, "z"], - ... } - ... ) - >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌──────────────────────┬──────────────────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u64 ┆ u64 │ - ╞══════════════════════╪══════════════════════╡ - │ 9774092659964970114 ┆ 13614470193936745724 │ - │ 1101441246220388612 ┆ 11638928888656214026 │ - │ 11638928888656214026 ┆ 13382926553367784577 │ - └──────────────────────┴──────────────────────┘ - - ''' - def reinterpret(self) -> Self: - ''' - Reinterpret the underlying bits as a signed/unsigned integer. - - This operation is only allowed for 64bit integers. For lower bits integers, - you can safely use that cast operation. - - Parameters - ---------- - signed - If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. - - Examples - -------- - >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) - >>> df = pl.DataFrame([s]) - >>> df.select( - ... [ - ... 
pl.col("a").reinterpret(signed=True).alias("reinterpreted"), - ... pl.col("a").alias("original"), - ... ] - ... ) - shape: (3, 2) - ┌───────────────┬──────────┐ - │ reinterpreted ┆ original │ - │ --- ┆ --- │ - │ i64 ┆ u64 │ - ╞═══════════════╪══════════╡ - │ 1 ┆ 1 │ - │ 1 ┆ 1 │ - │ 2 ┆ 2 │ - └───────────────┴──────────┘ - - ''' - def inspect(self, fmt: str = ...) -> Self: - ''' - Print the value that this expression evaluates to and pass on the value. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 1, 2]}) - >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) - value is: shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 4 - ] - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 4 │ - └─────┘ - - ''' - def interpolate(self, method: InterpolationMethod = ...) -> Self: - ''' - Fill null values using interpolation. - - Parameters - ---------- - method : {\'linear\', \'nearest\'} - Interpolation method. - - Examples - -------- - Fill null values using linear interpolation. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, None, 3], - ... "b": [1.0, float("nan"), 3.0], - ... } - ... ) - >>> df.select(pl.all().interpolate()) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 1.0 ┆ 1.0 │ - │ 2.0 ┆ NaN │ - │ 3.0 ┆ 3.0 │ - └─────┴─────┘ - - Fill null values using nearest interpolation. - - >>> df.select(pl.all().interpolate("nearest")) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 1.0 │ - │ 3 ┆ NaN │ - │ 3 ┆ 3.0 │ - └─────┴─────┘ - - Regrid data to a new grid. - - >>> df_original_grid = pl.DataFrame( - ... { - ... "grid_points": [1, 3, 10], - ... "values": [2.0, 6.0, 20.0], - ... } - ... ) # Interpolate from this to the new grid - >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) - >>> df_new_grid.join( - ... df_original_grid, on="grid_points", how="left" - ... ).with_columns(pl.col("values").interpolate()) - shape: (10, 2) - ┌─────────────┬────────┐ - │ grid_points ┆ values │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════════════╪════════╡ - │ 1 ┆ 2.0 │ - │ 2 ┆ 4.0 │ - │ 3 ┆ 6.0 │ - │ 4 ┆ 8.0 │ - │ … ┆ … │ - │ 7 ┆ 14.0 │ - │ 8 ┆ 16.0 │ - │ 9 ┆ 18.0 │ - │ 10 ┆ 20.0 │ - └─────────────┴────────┘ - - ''' - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling min (moving min) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their min. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. 
Can be a fixed integer size, or a dynamic - temporal size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 4.0 │ - │ 6.0 ┆ 5.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.25 │ - │ 3.0 ┆ 0.5 │ - │ 4.0 ┆ 0.75 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ 1.25 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 4.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... 
).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - >>> df_temporal.with_columns( - ... rolling_row_min=pl.col("row_nr").rolling_min( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_min │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling max (moving max) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their max. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. 
- closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ 6.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.25 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 3.75 │ - │ 6.0 ┆ 4.5 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 3.0 │ - │ 3.0 ┆ 4.0 │ - │ 4.0 ┆ 5.0 │ - │ 5.0 ┆ 6.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling max with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_max=pl.col("row_nr").rolling_max( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_max │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling max with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_max=pl.col("row_nr").rolling_max( - ... 
window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_max │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling mean (moving mean) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their mean. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. 
- - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴──────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.75 │ - │ 3.0 ┆ 2.75 │ - │ 4.0 ┆ 3.75 │ - │ 5.0 ┆ 4.75 │ - │ 6.0 ┆ 5.75 │ - └─────┴──────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ null │ - └─────┴──────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling mean with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_mean=pl.col("row_nr").rolling_mean( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬──────────────────┐ - │ row_nr ┆ date ┆ rolling_row_mean │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪══════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ - └────────┴─────────────────────┴──────────────────┘ - - Compute the rolling mean with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_mean=pl.col("row_nr").rolling_mean( - ... window_size="2h", by="date", closed="both" - ... ) - ... 
) - shape: (25, 3) - ┌────────┬─────────────────────┬──────────────────┐ - │ row_nr ┆ date ┆ rolling_row_mean │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪══════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ - └────────┴─────────────────────┴──────────────────┘ - - ''' - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling sum (moving sum) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their sum. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - of dtype `{Date, Datetime}` - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum(window_size=2), - ... 
) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 3.0 │ - │ 3.0 ┆ 5.0 │ - │ 4.0 ┆ 7.0 │ - │ 5.0 ┆ 9.0 │ - │ 6.0 ┆ 11.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.75 │ - │ 3.0 ┆ 2.75 │ - │ 4.0 ┆ 3.75 │ - │ 5.0 ┆ 4.75 │ - │ 6.0 ┆ 5.75 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 6.0 │ - │ 3.0 ┆ 9.0 │ - │ 4.0 ┆ 12.0 │ - │ 5.0 ┆ 15.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling sum with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_sum=pl.col("row_nr").rolling_sum( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_sum │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling sum with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_sum=pl.col("row_nr").rolling_sum( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_sum │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Self: - ''' - Compute a rolling standard deviation. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` means - the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.707107 │ - │ 3.0 ┆ 0.707107 │ - │ 4.0 ┆ 0.707107 │ - │ 5.0 ┆ 0.707107 │ - │ 6.0 ┆ 0.707107 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... 
) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.433013 │ - │ 3.0 ┆ 0.433013 │ - │ 4.0 ┆ 0.433013 │ - │ 5.0 ┆ 0.433013 │ - │ 6.0 ┆ 0.433013 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 1.0 │ - │ 4.0 ┆ 1.0 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling std with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_std=pl.col("row_nr").rolling_std( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_std │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling std with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_std=pl.col("row_nr").rolling_std( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_std │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling variance. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... 
- - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.5 │ - │ 3.0 ┆ 0.5 │ - │ 4.0 ┆ 0.5 │ - │ 5.0 ┆ 0.5 │ - │ 6.0 ┆ 0.5 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.1875 │ - │ 3.0 ┆ 0.1875 │ - │ 4.0 ┆ 0.1875 │ - │ 5.0 ┆ 0.1875 │ - │ 6.0 ┆ 0.1875 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), - ... 
) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 1.0 │ - │ 4.0 ┆ 1.0 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling var with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_var=pl.col("row_nr").rolling_var( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_var │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling var with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_var=pl.col("row_nr").rolling_var( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_var │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling median. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` means - the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. 
Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴────────────────┘ - - Specify weights for the values in each window: - - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴────────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ null │ - └─────┴────────────────┘ - - ''' - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling quantile. 
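# A small, hypothetical sketch (not part of the upstream docstring): with the default
# interpolation and an odd window size, `rolling_quantile` at quantile=0.5 is expected
# to agree with `rolling_median`.
import polars as pl

df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]})
out = df.with_columns(
    med=pl.col("A").rolling_median(window_size=3),
    q50=pl.col("A").rolling_quantile(quantile=0.5, window_size=3),
)
# `med` and `q50` should match row for row (both null while the window is incomplete).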
- - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - window_size - The length of the window. Can be a fixed integer size, or a dynamic - temporal size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, window_size=4 - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 2.0 │ - │ 5.0 ┆ 3.0 │ - │ 6.0 ┆ 4.0 │ - └─────┴──────────────────┘ - - Specify weights for the values in each window: - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] - ... ), - ... 
) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 2.0 │ - │ 5.0 ┆ 3.0 │ - │ 6.0 ┆ 4.0 │ - └─────┴──────────────────┘ - - Specify weights and interpolation method - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, - ... window_size=4, - ... weights=[0.2, 0.4, 0.4, 0.2], - ... interpolation="linear", - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 1.625 │ - │ 5.0 ┆ 2.625 │ - │ 6.0 ┆ 3.625 │ - └─────┴──────────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.2, window_size=5, center=True - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ null │ - │ 6.0 ┆ null │ - └─────┴──────────────────┘ - - ''' - def rolling_skew(self, window_size: int) -> Self: - ''' - Compute a rolling skew. - - The window at a given row includes the row itself and the - `window_size - 1` elements before it. - - Parameters - ---------- - window_size - Integer size of the rolling window. - bias - If False, the calculations are corrected for statistical bias. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) - >>> df.select(pl.col("a").rolling_skew(3)) - shape: (4, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ null │ - │ null │ - │ 0.381802 │ - │ 0.47033 │ - └──────────┘ - - Note how the values match the following: - - >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() - (0.38180177416060584, 0.47033046033698594) - - ''' - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a custom rolling window function. - - .. warning:: - Computing custom functions is extremely slow. Use specialized rolling - functions such as :func:`Expr.rolling_sum` if at all possible. - - Parameters - ---------- - function - Custom aggregation function. - window_size - Size of the window. The window at a given row will include the row - itself and the `window_size - 1` elements before it. - weights - A list of weights with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window. - - Examples - -------- - >>> from numpy import nansum - >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) - >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) - shape: (5, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ null │ - │ null │ - │ 22.0 │ - │ 11.0 │ - │ 17.0 │ - └──────┘ - - ''' - def abs(self) -> Self: - ''' - Compute absolute values. - - Same as `abs(expr)`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [-1.0, 0.0, 1.0, 2.0], - ... } - ... 
) - >>> df.select(pl.col("A").abs()) - shape: (4, 1) - ┌─────┐ - │ A │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 0.0 │ - │ 1.0 │ - │ 2.0 │ - └─────┘ - - ''' - def rank(self, method: RankMethod = ...) -> Self: - ''' - Assign ranks to data, dealing with ties appropriately. - - Parameters - ---------- - method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} - The method used to assign ranks to tied elements. - The following methods are available (default is \'average\'): - - - \'average\' : The average of the ranks that would have been assigned to - all the tied values is assigned to each value. - - \'min\' : The minimum of the ranks that would have been assigned to all - the tied values is assigned to each value. (This is also referred to - as "competition" ranking.) - - \'max\' : The maximum of the ranks that would have been assigned to all - the tied values is assigned to each value. - - \'dense\' : Like \'min\', but the rank of the next highest element is - assigned the rank immediately after those assigned to the tied - elements. - - \'ordinal\' : All values are given a distinct rank, corresponding to - the order that the values occur in the Series. - - \'random\' : Like \'ordinal\', but the rank for ties is not dependent - on the order that the values occur in the Series. - descending - Rank in descending order. - seed - If `method="random"`, use this as seed. - - Examples - -------- - The \'average\' method: - - >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) - >>> df.select(pl.col("a").rank()) - shape: (5, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 3.0 │ - │ 4.5 │ - │ 1.5 │ - │ 1.5 │ - │ 4.5 │ - └─────┘ - - The \'ordinal\' method: - - >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) - >>> df.select(pl.col("a").rank("ordinal")) - shape: (5, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 3 │ - │ 4 │ - │ 1 │ - │ 2 │ - │ 5 │ - └─────┘ - - Use \'rank\' with \'over\' to rank within groups: - - >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) - >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) - shape: (5, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ rank │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ f64 │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ 1.0 │ - │ 1 ┆ 7 ┆ 2.0 │ - │ 2 ┆ 5 ┆ 1.0 │ - │ 2 ┆ 14 ┆ 3.0 │ - │ 2 ┆ 11 ┆ 2.0 │ - └─────┴─────┴──────┘ - - ''' - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: - ''' - Calculate the first discrete difference between shifted items. - - Parameters - ---------- - n - Number of slots to shift. - null_behavior : {\'ignore\', \'drop\'} - How to handle null values. - - Examples - -------- - >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) - >>> df.with_columns(change=pl.col("int").diff()) - shape: (5, 2) - ┌─────┬────────┐ - │ int ┆ change │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪════════╡ - │ 20 ┆ null │ - │ 10 ┆ -10 │ - │ 30 ┆ 20 │ - │ 25 ┆ -5 │ - │ 35 ┆ 10 │ - └─────┴────────┘ - - >>> df.with_columns(change=pl.col("int").diff(n=2)) - shape: (5, 2) - ┌─────┬────────┐ - │ int ┆ change │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪════════╡ - │ 20 ┆ null │ - │ 10 ┆ null │ - │ 30 ┆ 10 │ - │ 25 ┆ 15 │ - │ 35 ┆ 5 │ - └─────┴────────┘ - - >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) - shape: (3, 1) - ┌──────┐ - │ diff │ - │ --- │ - │ i64 │ - ╞══════╡ - │ 10 │ - │ 15 │ - │ 5 │ - └──────┘ - - ''' - def pct_change(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Computes percentage change between values. 
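# A minimal, hypothetical sketch (not part of the upstream docstring): on null-free
# data, the default `pct_change()` (n=1) is expected to match the shift-based formula
# written out below.
import polars as pl

df = pl.DataFrame({"a": [10, 11, 12, 12]})
out = df.with_columns(
    pct=pl.col("a").pct_change(),
    manual=(pl.col("a") - pl.col("a").shift(1)) / pl.col("a").shift(1),
)
# Both new columns should read [null, 0.1, 0.090909..., 0.0].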
- - Percentage change (as fraction) between current element and most-recent - non-null element at least `n` period(s) before the current element. - - Computes the change from the previous row by default. - - Parameters - ---------- - n - periods to shift for forming percent change. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [10, 11, 12, None, 12], - ... } - ... ) - >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) - shape: (5, 2) - ┌──────┬────────────┐ - │ a ┆ pct_change │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞══════╪════════════╡ - │ 10 ┆ null │ - │ 11 ┆ 0.1 │ - │ 12 ┆ 0.090909 │ - │ null ┆ 0.0 │ - │ 12 ┆ 0.0 │ - └──────┴────────────┘ - - ''' - def skew(self) -> Self: - ''' - Compute the sample skewness of a data set. - - For normally distributed data, the skewness should be about zero. For - unimodal continuous distributions, a skewness value greater than zero means - that there is more weight in the right tail of the distribution. The - function `skewtest` can be used to determine if the skewness value - is close enough to zero, statistically speaking. - - - See scipy.stats for more information. - - Parameters - ---------- - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Notes - ----- - The sample skewness is computed as the Fisher-Pearson coefficient - of skewness, i.e. - - .. math:: g_1=\\frac{m_3}{m_2^{3/2}} - - where - - .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i - - is the biased sample :math:`i\\texttt{th}` central moment, and - :math:`\\bar{x}` is - the sample mean. If `bias` is False, the calculations are - corrected for bias and the value computed is the adjusted - Fisher-Pearson standardized moment coefficient, i.e. - - .. math:: - G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").skew()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.343622 │ - └──────────┘ - - ''' - def kurtosis(self) -> Self: - ''' - Compute the kurtosis (Fisher or Pearson) of a dataset. - - Kurtosis is the fourth central moment divided by the square of the - variance. If Fisher\'s definition is used, then 3.0 is subtracted from - the result to give 0.0 for a normal distribution. - If bias is False then the kurtosis is calculated using k statistics to - eliminate bias coming from biased moment estimators. - - See scipy.stats for more information - - Parameters - ---------- - fisher : bool, optional - If True, Fisher\'s definition is used (normal ==> 0.0). If False, - Pearson\'s definition is used (normal ==> 3.0). - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").kurtosis()) - shape: (1, 1) - ┌───────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -1.153061 │ - └───────────┘ - - ''' - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: - ''' - Set values outside the given boundaries to the boundary value. - - Parameters - ---------- - lower_bound - Lower bound. Accepts expression input. - Non-expression inputs are parsed as literals. - upper_bound - Upper bound. Accepts expression input. - Non-expression inputs are parsed as literals. 
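# A small, hypothetical sketch (not part of the upstream docstring): per the signature
# above, the bounds also accept expression input, e.g. clipping one column by two
# others ("lo" and "hi" are made-up column names).
import polars as pl

df = pl.DataFrame({"a": [-50, 5, 50], "lo": [0, 0, 0], "hi": [10, 20, 30]})
out = df.with_columns(clip=pl.col("a").clip(pl.col("lo"), pl.col("hi")))
# Expected `clip` column: [0, 5, 30].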
- - See Also - -------- - when - - Notes - ----- - This method only works for numeric and temporal columns. To clip other data - types, consider writing a `when-then-otherwise` expression. See :func:`when`. - - Examples - -------- - Specifying both a lower and upper bound: - - >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) - >>> df.with_columns(clip=pl.col("a").clip(1, 10)) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ clip │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ -50 ┆ 1 │ - │ 5 ┆ 5 │ - │ 50 ┆ 10 │ - │ null ┆ null │ - └──────┴──────┘ - - Specifying only a single bound: - - >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ clip │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ -50 ┆ -50 │ - │ 5 ┆ 5 │ - │ 50 ┆ 10 │ - │ null ┆ null │ - └──────┴──────┘ - - ''' - def lower_bound(self) -> Self: - ''' - Calculate the lower bound. - - Returns a unit Series with the lowest value possible for the dtype of this - expression. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").lower_bound()) - shape: (1, 1) - ┌──────────────────────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════════════════════╡ - │ -9223372036854775808 │ - └──────────────────────┘ - - ''' - def upper_bound(self) -> Self: - ''' - Calculate the upper bound. - - Returns a unit Series with the highest value possible for the dtype of this - expression. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").upper_bound()) - shape: (1, 1) - ┌─────────────────────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════════════════════╡ - │ 9223372036854775807 │ - └─────────────────────┘ - - ''' - def sign(self) -> Self: - ''' - Compute the element-wise indication of the sign. - - The returned values can be -1, 0, or 1: - - * -1 if x < 0. - * 0 if x == 0. - * 1 if x > 0. - - (null values are preserved as-is). - - Examples - -------- - >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) - >>> df.select(pl.col("a").sign()) - shape: (5, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ -1 │ - │ 0 │ - │ 0 │ - │ 1 │ - │ null │ - └──────┘ - - ''' - def sin(self) -> Self: - ''' - Compute the element-wise value for the sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").sin()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def cos(self) -> Self: - ''' - Compute the element-wise value for the cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").cos()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def tan(self) -> Self: - ''' - Compute the element-wise value for the tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").tan().round(2)) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 1.56 │ - └──────┘ - - ''' - def cot(self) -> Self: - ''' - Compute the element-wise value for the cotangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").cot().round(2)) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 0.64 │ - └──────┘ - - ''' - def arcsin(self) -> Self: - ''' - Compute the element-wise value for the inverse sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arcsin()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.570796 │ - └──────────┘ - - ''' - def arccos(self) -> Self: - ''' - Compute the element-wise value for the inverse cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").arccos()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.570796 │ - └──────────┘ - - ''' - def arctan(self) -> Self: - ''' - Compute the element-wise value for the inverse tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arctan()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.785398 │ - └──────────┘ - - ''' - def sinh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").sinh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.175201 │ - └──────────┘ - - ''' - def cosh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").cosh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.543081 │ - └──────────┘ - - ''' - def tanh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").tanh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.761594 │ - └──────────┘ - - ''' - def arcsinh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arcsinh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.881374 │ - └──────────┘ - - ''' - def arccosh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arccosh()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def arctanh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arctanh()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ inf │ - └─────┘ - - ''' - def degrees(self) -> Self: - ''' - Convert from radians to degrees. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> import math - >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) - >>> df.select(pl.col("a").degrees()) - shape: (9, 1) - ┌────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞════════╡ - │ -720.0 │ - │ -540.0 │ - │ -360.0 │ - │ -180.0 │ - │ 0.0 │ - │ 180.0 │ - │ 360.0 │ - │ 540.0 │ - │ 720.0 │ - └────────┘ - ''' - def radians(self) -> Self: - ''' - Convert from degrees to radians. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) - >>> df.select(pl.col("a").radians()) - shape: (9, 1) - ┌────────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞════════════╡ - │ -12.566371 │ - │ -9.424778 │ - │ -6.283185 │ - │ -3.141593 │ - │ 0.0 │ - │ 3.141593 │ - │ 6.283185 │ - │ 9.424778 │ - │ 12.566371 │ - └────────────┘ - ''' - def reshape(self, dimensions: tuple[int, ...]) -> Self: - ''' - Reshape this Expr to a flat Series or a Series of Lists. - - Parameters - ---------- - dimensions - Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that - dimension is inferred. - - Returns - ------- - Expr - If a single dimension is given, results in an expression of the original - data type. - If a multiple dimensions are given, results in an expression of data type - :class:`List` with shape (rows, cols). - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) - >>> df.select(pl.col("foo").reshape((3, 3))) - shape: (3, 1) - ┌───────────┐ - │ foo │ - │ --- │ - │ list[i64] │ - ╞═══════════╡ - │ [1, 2, 3] │ - │ [4, 5, 6] │ - │ [7, 8, 9] │ - └───────────┘ - - See Also - -------- - Expr.list.explode : Explode a list column. - - ''' - def shuffle(self, seed: int | None = ...) -> Self: - ''' - Shuffle the contents of this expression. - - Parameters - ---------- - seed - Seed for the random number generator. If set to None (default), a - random seed is generated each time the shuffle is called. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").shuffle(seed=1)) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - │ 1 │ - │ 3 │ - └─────┘ - - ''' - def sample(self, n: int | IntoExprColumn | None = ...) -> Self: - ''' - Sample from this expression. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - Shuffle the order of sampled data points. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 3 │ - │ 1 │ - │ 1 │ - └─────┘ - - ''' - def ewm_mean(self) -> Self: - ''' - Exponentially-weighted moving average. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. 
math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_mean(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.666667 │ - │ 2.428571 │ - └──────────┘ - - ''' - def ewm_std(self) -> Self: - ''' - Exponentially-weighted moving standard deviation. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. 
- For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_std(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 0.707107 │ - │ 0.963624 │ - └──────────┘ - - ''' - def ewm_var(self) -> Self: - ''' - Exponentially-weighted moving variance. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_var(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 0.5 │ - │ 0.928571 │ - └──────────┘ - - ''' - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: - ''' - Extremely fast method for extending the Series with \'n\' copies of a value. - - Parameters - ---------- - value - A constant literal value (not an expression) with which to extend the - expression result Series; can pass None to extend with nulls. - n - The number of additional values that will be added. 
- - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3]}) - >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) - shape: (5, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 0 │ - │ 1 │ - │ 2 │ - │ 99 │ - │ 99 │ - └────────┘ - - ''' - def value_counts(self) -> Self: - ''' - Count the occurrences of unique values. - - Parameters - ---------- - sort - Sort the output by count in descending order. - If set to `False` (default), the order of the output is random. - parallel - Execute the computation in parallel. - - .. note:: - This option should likely not be enabled in a group by context, - as the computation is already parallelized per group. - - Returns - ------- - Expr - Expression of data type :class:`Struct` with mapping of unique values to - their count. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} - ... ) - >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT - shape: (3, 1) - ┌─────────────┐ - │ color │ - │ --- │ - │ struct[2] │ - ╞═════════════╡ - │ {"red",2} │ - │ {"green",1} │ - │ {"blue",3} │ - └─────────────┘ - - Sort the output by count. - - >>> df.select(pl.col("color").value_counts(sort=True)) - shape: (3, 1) - ┌─────────────┐ - │ color │ - │ --- │ - │ struct[2] │ - ╞═════════════╡ - │ {"blue",3} │ - │ {"red",2} │ - │ {"green",1} │ - └─────────────┘ - - ''' - def unique_counts(self) -> Self: - ''' - Return a count of the unique values in the order of appearance. - - This method differs from `value_counts` in that it does not return the - values, only the counts and might be faster - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "id": ["a", "b", "b", "c", "c", "c"], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("id").unique_counts(), - ... ] - ... ) - shape: (3, 1) - ┌─────┐ - │ id │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def log(self, base: float = ...) -> Self: - ''' - Compute the logarithm to a given base. - - Parameters - ---------- - base - Given base, defaults to `e` - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").log(base=2)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 1.0 │ - │ 1.584963 │ - └──────────┘ - - ''' - def log1p(self) -> Self: - ''' - Compute the natural logarithm of each element plus one. - - This computes `log(1 + x)` but is more numerically stable for `x` close to zero. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").log1p()) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.693147 │ - │ 1.098612 │ - │ 1.386294 │ - └──────────┘ - - ''' - def entropy(self, base: float = ...) -> Self: - ''' - Computes the entropy. - - Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. - - Parameters - ---------- - base - Given base, defaults to `e` - normalize - Normalize pk if it doesn\'t sum to 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").entropy(base=2)) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.459148 │ - └──────────┘ - >>> df.select(pl.col("a").entropy(base=2, normalize=False)) - shape: (1, 1) - ┌───────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -6.754888 │ - └───────────┘ - - ''' - def cumulative_eval(self, expr: Expr, min_periods: int = ...) 
-> Self: - ''' - Run an expression over a sliding window that increases `1` slot every iteration. - - Parameters - ---------- - expr - Expression to evaluate - min_periods - Number of valid values there should be in the window before the expression - is evaluated. valid values = `length - null_count` - parallel - Run in parallel. Don\'t do this in a group by or another operation that - already has much parallelization. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - This can be really slow as it can have `O(n^2)` complexity. Don\'t use this - for operations that visit all elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) - >>> df.select( - ... [ - ... pl.col("values").cumulative_eval( - ... pl.element().first() - pl.element().last() ** 2 - ... ) - ... ] - ... ) - shape: (5, 1) - ┌────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞════════╡ - │ 0.0 │ - │ -3.0 │ - │ -8.0 │ - │ -15.0 │ - │ -24.0 │ - └────────┘ - - ''' - def set_sorted(self) -> Self: - ''' - Flags the expression as \'sorted\'. - - Enables downstream code to user fast paths for sorted arrays. - - Parameters - ---------- - descending - Whether the `Series` order is descending. - - Warnings - -------- - This can lead to incorrect results if this `Series` is not sorted!! - Use with care! - - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3]}) - >>> df.select(pl.col("values").set_sorted().max()) - shape: (1, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 3 │ - └────────┘ - - ''' - def shrink_dtype(self) -> Self: - ''' - Shrink numeric columns to the minimal required datatype. - - Shrink to the dtype needed to fit the extrema of this [`Series`]. - This can be used to reduce memory pressure. - - Examples - -------- - >>> pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [1, 2, 2 << 32], - ... "c": [-1, 2, 1 << 30], - ... "d": [-112, 2, 112], - ... "e": [-112, 2, 129], - ... "f": ["a", "b", "c"], - ... "g": [0.1, 1.32, 0.12], - ... "h": [True, None, False], - ... } - ... ).select(pl.all().shrink_dtype()) - shape: (3, 8) - ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ - ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ - │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ - │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ - │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ - └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ - - ''' - def cache(self) -> Self: - """ - Cache this expression so that it only is executed once per context. - - .. deprecated:: 0.18.9 - This method now does nothing. It has been superseded by the - `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically - caches expressions that are equal. - - """ - def replace(self, mapping: dict[Any, Any]) -> Self: - ''' - Replace values according to the given mapping. - - Needs a global string cache for lazily evaluated queries on columns of - type `Categorical`. - - Parameters - ---------- - mapping - Mapping of values to their replacement. - default - Value to use when the mapping does not contain the lookup value. - Defaults to keeping the original value. Accepts expression input. - Non-expression inputs are parsed as literals. - return_dtype - Set return dtype to override automatic return dtype determination. 
- - See Also - -------- - str.replace - - Examples - -------- - Replace a single value by another value. Values not in the mapping remain - unchanged. - - >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) - >>> df.with_columns(pl.col("a").replace({2: 100}).alias("replaced")) - shape: (4, 2) - ┌─────┬──────────┐ - │ a ┆ replaced │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════════╡ - │ 1 ┆ 1 │ - │ 2 ┆ 100 │ - │ 2 ┆ 100 │ - │ 3 ┆ 3 │ - └─────┴──────────┘ - - Replace multiple values. Specify a default to set values not in the given map - to the default value. - - >>> df = pl.DataFrame({"country_code": ["FR", "ES", "DE", None]}) - >>> country_code_map = { - ... "CA": "Canada", - ... "DE": "Germany", - ... "FR": "France", - ... None: "unspecified", - ... } - >>> df.with_columns( - ... pl.col("country_code") - ... .replace(country_code_map, default=None) - ... .alias("replaced") - ... ) - shape: (4, 2) - ┌──────────────┬─────────────┐ - │ country_code ┆ replaced │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞══════════════╪═════════════╡ - │ FR ┆ France │ - │ ES ┆ null │ - │ DE ┆ Germany │ - │ null ┆ unspecified │ - └──────────────┴─────────────┘ - - The return type can be overridden with the `return_dtype` argument. - - >>> df = df.with_row_count() - >>> df.select( - ... "row_nr", - ... pl.col("row_nr") - ... .replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) - ... .alias("replaced"), - ... ) - shape: (4, 2) - ┌────────┬──────────┐ - │ row_nr ┆ replaced │ - │ --- ┆ --- │ - │ u32 ┆ u8 │ - ╞════════╪══════════╡ - │ 0 ┆ 0 │ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 0 │ - └────────┴──────────┘ - - To reference other columns as a `default` value, a struct column must be - constructed first. The first field must be the column in which values are - replaced. The other columns can be used in the default expression. - - >>> df.with_columns( - ... pl.struct("country_code", "row_nr") - ... .replace( - ... mapping=country_code_map, - ... default=pl.col("row_nr").cast(pl.Utf8), - ... ) - ... .alias("replaced") - ... ) - shape: (4, 3) - ┌────────┬──────────────┬─────────────┐ - │ row_nr ┆ country_code ┆ replaced │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ str ┆ str │ - ╞════════╪══════════════╪═════════════╡ - │ 0 ┆ FR ┆ France │ - │ 1 ┆ ES ┆ 1 │ - │ 2 ┆ DE ┆ Germany │ - │ 3 ┆ null ┆ unspecified │ - └────────┴──────────────┴─────────────┘ - ''' - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom python function to a Series or sequence of Series. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.map_batches`. - - Parameters - ---------- - function - Lambda/ function to apply. - return_dtype - Dtype of the output Series. - agg_list - Aggregate list - - """ - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.map_elements`. - - Parameters - ---------- - function - Lambda/ function to apply. - return_dtype - Dtype of the output Series. - If not set, the dtype will be - `polars.Unknown`. - skip_nulls - Don't apply the function over values - that contain nulls. This is faster. - pass_name - Pass the Series name to the custom function - This is more expensive. - strategy : {'thread_local', 'threading'} - This functionality is in `alpha` stage. 
This may be removed - /changed without it being considered a breaking change. - - - 'thread_local': run the python function on a single thread. - - 'threading': run the python function on separate threads. Use with - care as this can slow performance. This might only speed up - your code if the amount of work per element is significant - and the python function releases the GIL (e.g. via calling - a c function) - - """ - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - """ - Apply a custom rolling window function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.rolling_map`. - - Parameters - ---------- - function - Aggregation function - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - """ - def is_first(self) -> Self: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Expr.is_first_distinct`. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - """ - def is_last(self) -> Self: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Expr.is_last_distinct`. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - """ - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: - """ - Clip (limit) the values in an array to a `min` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - lower_bound - Lower bound. - - """ - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: - """ - Clip (limit) the values in an array to a `max` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - upper_bound - Upper bound. - - """ - def shift_and_fill(self, fill_value: IntoExpr) -> Self: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - Fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def register_plugin(self) -> Self: - """ - Register a shared library as a plugin. - - .. warning:: - This is highly unsafe as this will call the C function - loaded by `lib::symbol`. - - The parameters you give dictate how polars will deal - with the function. Make sure they are correct! - - .. note:: - This functionality is unstable and may change without it - being considered breaking. - - Parameters - ---------- - lib - Library to load. - symbol - Function to load. - args - Arguments (other than self) passed to this function. - These arguments have to be of type Expression. - kwargs - Non-expression arguments. They must be JSON serializable. - is_elementwise - If the function only operates on scalars - this will trigger fast paths. - input_wildcard_expansion - Expand expressions as input of this function. 
- returns_scalar - Automatically explode on unit length if it ran as final aggregation. - this is the case for aggregations like `sum`, `min`, `covariance` etc. - cast_to_supertypes - Cast the input datatypes to their supertype. - pass_name_to_apply - if set, then the `Series` passed to the function in the group_by operation - will ensure the name is set. This is an extra heap allocation per group. - changes_length - For example a `unique` or a `slice` - - """ - def _register_plugin(self) -> Self: ... - def take_every(self, n: int) -> Self: - """ - Take every nth value in the Series and return as a new Series. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: - """ - Take values by index. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather`. - - Parameters - ---------- - indices - An expression that leads to a UInt32 dtyped Series. - """ - def cumsum(self) -> Self: - """ - Get an array with the cumulative sum computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_sum`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cumprod(self) -> Self: - """ - Get an array with the cumulative product computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_prod`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cummin(self) -> Self: - """ - Get an array with the cumulative min computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_min`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cummax(self) -> Self: - """ - Get an array with the cumulative max computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_max`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cumcount(self) -> Self: - """ - Get an array with the cumulative count computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_count`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def map_dict(self, mapping: dict[Any, Any]) -> Self: - """ - Replace values in column according to remapping dictionary. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`replace`. The default behavior - has changed to keep any values not present in the mapping unchanged. - Pass `default=None` to keep existing behavior. - - Parameters - ---------- - mapping - Dictionary containing the before/after values to map. - default - Value to use when the remapping dict does not contain the lookup value. - Accepts expression input. Non-expression inputs are parsed as literals. - Use `pl.first()`, to keep the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - - """ - @property - def bin(self): ... - @property - def cat(self): ... - @property - def dt(self): ... - @property - def list(self): ... - @property - def arr(self): ... - @property - def meta(self): ... - @property - def name(self): ... - @property - def str(self): ... - @property - def struct(self): ... -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) 
-> float: - """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/expr/expr.pyi new file mode 100644 index 0000000..ec87157 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/expr/expr.pyi @@ -0,0 +1,8386 @@ +#: version 0.20.2 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import UInt32 as UInt32 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import _warn_null_comparison as _warn_null_comparison, no_default as no_default, sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... 
+ def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + .. 
note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.map`. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + keep_name + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.prefix`. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.suffix`. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.keep`. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with `^` and end with `$`. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... 
"""Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Return the number of non-null elements in the column. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + len + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 2 │ + └─────┴─────┘ + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values count towards the total. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + count + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. 
Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cum_sum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_sum().alias("cum_sum"), + ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_sum ┆ cum_sum_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 10 │ + │ 2 ┆ 3 ┆ 9 │ + │ 3 ┆ 6 ┆ 7 │ + │ 4 ┆ 10 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_sum().alias("value_cum_sum"), + ... pl.col("values") + ... 
.cum_sum() + ... .forward_fill() + ... .alias("value_cum_sum_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬───────────────┬──────────────────────────┐ + │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═══════════════╪══════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴───────────────┴──────────────────────────┘ + + ''' + def cum_prod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_prod().alias("cum_prod"), + ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), + ... ) + shape: (4, 3) + ┌─────┬──────────┬──────────────────┐ + │ a ┆ cum_prod ┆ cum_prod_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════╪══════════════════╡ + │ 1 ┆ 1 ┆ 24 │ + │ 2 ┆ 2 ┆ 24 │ + │ 3 ┆ 6 ┆ 12 │ + │ 4 ┆ 24 ┆ 4 │ + └─────┴──────────┴──────────────────┘ + + ''' + def cum_min(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_min().alias("cum_min"), + ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_min ┆ cum_min_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 1 ┆ 3 │ + │ 4 ┆ 1 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + ''' + def cum_max(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_max().alias("cum_max"), + ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_max ┆ cum_max_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 2 ┆ 4 │ + │ 3 ┆ 3 ┆ 4 │ + │ 4 ┆ 4 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_max().alias("cum_max"), + ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬─────────┬────────────────────┐ + │ values ┆ cum_max ┆ cum_max_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════════╪════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴─────────┴────────────────────┘ + + ''' + def cum_count(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. 
+ + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_count().alias("cum_count"), + ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), + ... ) + shape: (4, 3) + ┌─────┬───────────┬───────────────────┐ + │ a ┆ cum_count ┆ cum_count_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ u32 ┆ u32 │ + ╞═════╪═══════════╪═══════════════════╡ + │ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 2 ┆ 1 │ + │ 4 ┆ 3 ┆ 0 │ + └─────┴───────────┴───────────────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def round_sig_figs(self, digits: int) -> Self: + ''' + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) + >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) + shape: (3, 2) + ┌─────────┬────────────────┐ + │ a ┆ round_sig_figs │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════════╪════════════════╡ + │ 0.01234 ┆ 0.012 │ + │ 3.333 ┆ 3.3 │ + │ 1234.0 ┆ 1200.0 │ + └─────────┴────────────────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... 
) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', the index of the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + See Also + -------- + Expr.get : Take a single value + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg( + ... pl.col("value").gather([2, 1]) + ... ) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + ''' + def get(self, index: int | Expr) -> Self: + ''' + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. 
+ + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... 
) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), + ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), + ... 
).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for `map` functions is transforming the values + represented by an expression using a third-party library. + + .. warning:: + If you are looking to map a function over a window function or group_by + context, refer to :func:`map_elements` instead. + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_elements + replace + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type `Callable[[Any], Any]`. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type `Callable[[Series], Any]`. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be `pl.Unknown`. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). 
+ pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using `map_elements` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using `over` is considered a GroupBy context + here, so `map_elements` can be used to map functions over window groups. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using `over` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... 
).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort( + ... "key" + ... ) # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").gather_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator `expr & other & ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator `expr | other | ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other` where `None == None`. + + This differs from default `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator `expr >= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ true │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator `expr > other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator `expr <= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator `expr < other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator `expr != other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... 
} + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr != other` where `None == None`. + + This differs from default `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator `expr + other`. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator `expr // other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator `expr % other`. + + Parameters + ---------- + other + Numeric literal or expression value. 
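A small supplementary sketch combining `floordiv` and `mod` (the column name `x` and the divisor 3 are illustrative): for non-negative integers, the quotient and remainder recombine to the original value, i.e. `x == x.floordiv(n) * n + x.mod(n)`.

    import polars as pl

    df = pl.DataFrame({"x": [0, 1, 2, 3, 4, 7, 10]})

    out = df.with_columns(
        q=pl.col("x").floordiv(3),  # integer quotient
        r=pl.col("x").mod(3),       # remainder
    ).with_columns(
        # For non-negative x this reconstructs the original column exactly.
        check=(pl.col("q") * 3 + pl.col("r")) == pl.col("x"),
    )
    print(out)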
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator `expr * other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator `expr - other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator `expr / other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator `expr ** exponent`. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator `expr ^ other`. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. 
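As a supplementary sketch, `xor` can flag rows where exactly one of two boolean conditions holds (the column names and thresholds below are invented for illustration):

    import polars as pl

    df = pl.DataFrame({"a": [5, -1, 3, -2], "b": [0, 10, 4, -7]})

    out = df.with_columns(
        # True when exactly one of the two conditions is met.
        exactly_one=(pl.col("a") > 0).xor(pl.col("b") > 0),
    )
    print(out)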
+ + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) + shape: (3, 3) + ┌───────────┬──────────────────┬──────────┐ + │ sets ┆ optional_members ┆ contains │ + │ --- ┆ --- ┆ --- │ + │ list[i64] ┆ i64 ┆ bool │ + ╞═══════════╪══════════════════╪══════════╡ + │ [1, 2, 3] ┆ 1 ┆ true │ + │ [1, 2] ┆ 2 ┆ true │ + │ [9, 10] ┆ 3 ┆ false │ + └───────────┴──────────────────┴──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. 
Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with `lit` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... 
pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... 
window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Self: + ''' + Compute a rolling standard deviation. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... 
+ - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. 
+ + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... 
) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. 
+ + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: + ''' + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. 
+ + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. + If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. 
math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. + + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) 
-> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def hist(self, bins: IntoExpr | None = ...) -> Self: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + include_breakpoint + Include a column that indicates the upper breakpoint. + include_category + Include a column that shows the intervals as categories. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 3, 8, 8, 2, 1, 3]}) + >>> df.select(pl.col("a").hist(bins=[1, 2, 3])) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 2 │ + │ 2 │ + └─────┘ + >>> df.select( + ... pl.col("a").hist( + ... bins=[1, 2, 3], include_breakpoint=True, include_category=True + ... ) + ... ) + shape: (4, 1) + ┌───────────────────────┐ + │ a │ + │ --- │ + │ struct[3] │ + ╞═══════════════════════╡ + │ {1.0,"(-inf, 1.0]",2} │ + │ {2.0,"(1.0, 2.0]",1} │ + │ {3.0,"(2.0, 3.0]",2} │ + │ {inf,"(3.0, inf]",2} │ + └───────────────────────┘ + + ''' + def replace(self, old: IntoExpr | Sequence[Any] | Mapping[Any, Any], new: IntoExpr | Sequence[Any] | NoDefault = ...) -> Self: + ''' + Replace values by different values. + + Parameters + ---------- + old + Value or sequence of values to replace. + Accepts expression input. Sequences are parsed as Series, + other non-expression inputs are parsed as literals. + Also accepts a mapping of values to their replacement as syntactic sugar for + `replace(new=Series(mapping.keys()), old=Series(mapping.values()))`. + new + Value or sequence of values to replace by. + Accepts expression input. Sequences are parsed as Series, + other non-expression inputs are parsed as literals. + Length must match the length of `old` or have length 1. + default + Set values that were not replaced to this value. + Defaults to keeping the original value. + Accepts expression input. Non-expression inputs are parsed as literals. + return_dtype + The data type of the resulting expression. If set to `None` (default), + the data type is determined automatically based on the other inputs. + + See Also + -------- + str.replace + + Notes + ----- + The global string cache must be enabled when replacing categorical values. + + Examples + -------- + Replace a single value by another value. Values that were not replaced remain + unchanged. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) + >>> df.with_columns(replaced=pl.col("a").replace(2, 100)) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 3 │ + └─────┴──────────┘ + + Replace multiple values by passing sequences to the `old` and `new` parameters. + + >>> df.with_columns(replaced=pl.col("a").replace([2, 3], [100, 200])) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 200 │ + └─────┴──────────┘ + + Passing a mapping with replacements is also supported as syntactic sugar. + Specify a default to set all values that were not matched. + + >>> mapping = {2: 100, 3: 200} + >>> df.with_columns(replaced=pl.col("a").replace(mapping, default=-1)) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ -1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 200 │ + └─────┴──────────┘ + + Replacing by values of a different data type sets the return type based on + a combination of the `new` data type and either the original data type or the + default data type if it was set. 
+ + >>> df = pl.DataFrame({"a": ["x", "y", "z"]}) + >>> mapping = {"x": 1, "y": 2, "z": 3} + >>> df.with_columns(replaced=pl.col("a").replace(mapping)) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + >>> df.with_columns(replaced=pl.col("a").replace(mapping, default=None)) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + + Set the `return_dtype` parameter to control the resulting data type directly. + + >>> df.with_columns( + ... replaced=pl.col("a").replace(mapping, return_dtype=pl.UInt8) + ... ) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ u8 │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + + Expression input is supported for all parameters. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3], "b": [1.5, 2.5, 5.0, 1.0]}) + >>> df.with_columns( + ... replaced=pl.col("a").replace( + ... old=pl.col("a").max(), + ... new=pl.col("b").sum(), + ... default=pl.col("b"), + ... ) + ... ) + shape: (4, 3) + ┌─────┬─────┬──────────┐ + │ a ┆ b ┆ replaced │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═════╪══════════╡ + │ 1 ┆ 1.5 ┆ 1.5 │ + │ 2 ┆ 2.5 ┆ 2.5 │ + │ 2 ┆ 5.0 ┆ 5.0 │ + │ 3 ┆ 1.0 ┆ 10.0 │ + └─────┴─────┴──────────┘ + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + agg_list + Aggregate list + + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + `polars.Unknown`. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. 
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by `lib::symbol`. + + The parameters you give dictate how polars will deal + with the function. Make sure they are correct! + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. + returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like `sum`, `min`, `covariance` etc. + cast_to_supertypes + Cast the input datatypes to their supertype. + pass_name_to_apply + if set, then the `Series` passed to the function in the group_by operation + will ensure the name is set. This is an extra heap allocation per group. + changes_length + For example a `unique` or a `slice` + + """ + def _register_plugin(self) -> Self: ... + def take_every(self, n: int) -> Self: + """ + Take every nth value in the Series and return as a new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. 
+ + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + """ + def cumsum(self) -> Self: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumprod(self) -> Self: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummin(self) -> Self: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummax(self) -> Self: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumcount(self) -> Self: + """ + Get an array with the cumulative count computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_count`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in column according to remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
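The deprecation shims above (`cumsum`, `take`, `map_dict`, ...) only record renames; the typed behavior lives under the new names. A minimal doctest-style sketch of the current spellings — illustrative only, not part of the generated stub, and assuming a Polars release at least as new as the versions cited in the deprecation notes (>= 0.19.16):

>>> import polars as pl
>>> df = pl.DataFrame({"a": [1, 2, 3]})
>>> out = df.with_columns(
...     cum=pl.col("a").cum_sum(),             # formerly cumsum()
...     picked=pl.col("a").gather([2, 0, 1]),  # formerly take()
...     mapped=pl.col("a").replace({1: 10}),   # formerly map_dict(); unmatched values are kept
... )

The resulting columns are cum = [1, 3, 6], picked = [3, 1, 2] and mapped = [10, 2, 3]; values absent from the mapping pass through unchanged, which matches the `replace` default described earlier in this stub.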
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/lazyframe/frame deleted file mode 100644 index 561f5b2..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/lazyframe/frame +++ /dev/null @@ -1,4211 +0,0 @@ -import P -import np -import pa -from builtins import PyLazyFrame -from pathlib import Path -from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 -from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat -from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy -from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath -from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence - -TYPE_CHECKING: bool -DTYPE_TEMPORAL_UNITS: frozenset -N_INFER_DEFAULT: int - -class LazyFrame: - _accessors: _ClassVar[set] = ... - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - @classmethod - def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: - """ - Lazily read from a CSV file or multiple files via glob patterns. - - Use `pl.scan_csv` to dispatch to this method. - - See Also - -------- - polars.io.scan_csv - - """ - @classmethod - def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: - """ - Lazily read from a parquet file or multiple files via glob patterns. - - Use `pl.scan_parquet` to dispatch to this method. 
- - See Also - -------- - polars.io.scan_parquet - - """ - @classmethod - def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: - """ - Lazily read from an Arrow IPC (Feather v2) file. - - Use `pl.scan_ipc` to dispatch to this method. - - See Also - -------- - polars.io.scan_ipc - - """ - @classmethod - def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: - """ - Lazily read from a newline delimited JSON file. - - Use `pl.scan_ndjson` to dispatch to this method. - - See Also - -------- - polars.io.scan_ndjson - - """ - @classmethod - def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: - """ - Read a logical plan from a JSON string to construct a LazyFrame. - - .. deprecated:: 0.18.12 - This method is deprecated. Convert the JSON string to `StringIO` - and then use `LazyFrame.deserialize`. - - Parameters - ---------- - json - String in JSON format. - - See Also - -------- - deserialize - - """ - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: - """ - Read a logical plan from a JSON file to construct a LazyFrame. - - .. deprecated:: 0.18.12 - This class method has been renamed to `deserialize`. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - - See Also - -------- - deserialize - - """ - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: - ''' - Read a logical plan from a JSON file to construct a LazyFrame. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - - See Also - -------- - LazyFrame.serialize - - Examples - -------- - >>> import io - >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() - >>> json = lf.serialize() - >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def __dataframe_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. - """ - def __bool__(self) -> NoReturn: ... - def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... - def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def _repr_html_(self) -> str: ... - def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: - ''' - Serialize the logical plan of this LazyFrame to a file or string in JSON format. - - Parameters - ---------- - file - File path to which the result should be written. If set to `None` - (default), the output is returned as a string instead. 
- - See Also - -------- - LazyFrame.deserialize - - Examples - -------- - Serialize the logical plan into a JSON string. - - >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() - >>> json = lf.serialize() - >>> json - \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' - - The logical plan can later be deserialized back into a LazyFrame. - - >>> import io - >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: - """ - Serialize the logical plan of this LazyFrame to a file or string in JSON format. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`LazyFrame.serialize`. - - Parameters - ---------- - file - File path to which the result should be written. If set to `None` - (default), the output is returned as a string instead. - """ - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the frame as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Examples - -------- - >>> def cast_str_to_int(data, col_name): - ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) - ... - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": ["10", "20", "30", "40"], - ... } - ... ) - >>> lf.pipe(cast_str_to_int, col_name="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 30 │ - │ 4 ┆ 40 │ - └─────┴─────┘ - - >>> lf = pl.LazyFrame( - ... { - ... "b": [1, 2], - ... "a": [3, 4], - ... } - ... ) - >>> lf.collect() - shape: (2, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - └─────┴─────┘ - >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 1 │ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def explain(self) -> str: - ''' - Create a string representation of the query plan. - - Different optimizations can be turned on or off. - - Parameters - ---------- - optimized - Return an optimized query plan. Defaults to `True`. - If this is set to `True` the subsequent - optimization flags control which optimizations - run. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... 
"b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).explain() # doctest: +SKIP - ''' - def show_graph(self) -> str | None: - ''' - Show a plot of the query plan. Note that you should have graphviz installed. - - Parameters - ---------- - optimized - Optimize the query plan. - show - Show the figure. - output_path - Write the figure to disk. - raw_output - Return dot syntax. This cannot be combined with `show` and/or `output_path`. - figsize - Passed to matplotlib if `show` == True. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).show_graph() # doctest: +SKIP - - ''' - def inspect(self, fmt: str = ...) -> Self: - ''' - Inspect a node in the computation graph. - - Print the value that this node in the computation graph evaluates to and passes - on the value. - - Examples - -------- - >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) - >>> ( - ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) - ... .inspect() # print the node before the filter - ... .filter(pl.col("bar") == pl.col("foo")) - ... ) # doctest: +ELLIPSIS - - - ''' - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: - ''' - Sort the DataFrame by the given columns. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. - *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, None], - ... "b": [6.0, 5.0, 4.0], - ... "c": ["a", "c", "b"], - ... } - ... ) - >>> lf.sort("a").collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - Sorting by expressions is also supported. - - >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - └──────┴─────┴─────┘ - - Sort by multiple columns by passing a list of columns. 
- - >>> lf.sort(["c", "a"], descending=True).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - └──────┴─────┴─────┘ - - Or use positional arguments to sort by multiple columns in the same way. - - >>> lf.sort("c", "a", descending=[False, True]).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - ''' - def top_k(self, k: int) -> Self: - ''' - Return the `k` largest elements. - - If \'descending=True` the smallest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might - be worse since this requires a stable search. - - See Also - -------- - bottom_k - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 largest values in column b. - - >>> lf.top_k(4, by="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ a ┆ 2 │ - │ b ┆ 2 │ - │ b ┆ 1 │ - └─────┴─────┘ - - Get the rows which contain the 4 largest values when sorting on column b and a. - - >>> lf.top_k(4, by=["b", "a"]).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 2 │ - │ c ┆ 1 │ - └─────┴─────┘ - - ''' - def bottom_k(self, k: int) -> Self: - ''' - Return the `k` smallest elements. - - If \'descending=True` the largest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - top_k - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 smallest values in column b. - - >>> lf.bottom_k(4, by="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 1 │ - │ a ┆ 1 │ - │ c ┆ 1 │ - │ a ┆ 2 │ - └─────┴─────┘ - - Get the rows which contain the 4 smallest values when sorting on column a and b. - - >>> lf.bottom_k(4, by=["a", "b"]).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ b ┆ 1 │ - │ b ┆ 2 │ - └─────┴─────┘ - - ''' - def profile(self) -> tuple[DataFrame, DataFrame]: - ''' - Profile a LazyFrame. 
- - This will run the query and return a tuple - containing the materialized DataFrame and a DataFrame that - contains profiling information of each node that is executed. - - The units of the timings are microseconds. - - Parameters - ---------- - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - show_plot - Show a gantt chart of the profiling result - truncate_nodes - Truncate the label lengths in the gantt chart to this number of - characters. - figsize - matplotlib figsize of the profiling plot - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).profile() # doctest: +SKIP - (shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘, - shape: (3, 3) - ┌─────────────────────────┬───────┬──────┐ - │ node ┆ start ┆ end │ - │ --- ┆ --- ┆ --- │ - │ str ┆ u64 ┆ u64 │ - ╞═════════════════════════╪═══════╪══════╡ - │ optimization ┆ 0 ┆ 5 │ - │ group_by_partitioned(a) ┆ 5 ┆ 470 │ - │ sort(a) ┆ 475 ┆ 1964 │ - └─────────────────────────┴───────┴──────┘) - - ''' - def collect(self) -> DataFrame: - ''' - Materialize this LazyFrame into a DataFrame. - - By default, all query optimizations are enabled. Individual optimizations may - be disabled by setting the corresponding parameter to `False`. - - Parameters - ---------- - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - no_optimization - Turn off (certain) optimizations. - streaming - Process the query in batches to handle larger-than-memory data. - If set to `False` (default), the entire query is processed in a single - batch. - - .. warning:: - This functionality is currently in an alpha state. - - .. note:: - Use :func:`explain` to see if Polars can process the query in streaming - mode. - - Returns - ------- - DataFrame - - See Also - -------- - fetch: Run the query on the first `n` rows only for debugging purposes. - explain : Print the query plan that is evaluated with collect. - profile : Collect the LazyFrame and time each node in the computation graph. - polars.collect_all : Collect multiple LazyFrames at the same time. - polars.Config.set_streaming_chunk_size : Set the size of streaming batches. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... 
) - >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - - Collect in streaming mode - - >>> lf.group_by("a").agg(pl.all().sum()).collect( - ... streaming=True - ... ) # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: - ''' - Collect DataFrame asynchronously in thread pool. - - Collects into a DataFrame (like :func:`collect`), but instead of returning - DataFrame directly, they are scheduled to be collected inside thread pool, - while this method returns almost instantly. - - May be useful if you use gevent or asyncio and want to release control to other - greenlets/tasks while LazyFrames are being collected. - - Parameters - ---------- - gevent - Return wrapper to `gevent.event.AsyncResult` instead of Awaitable - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Notes - ----- - In case of error `set_exception` is used on - `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - See Also - -------- - polars.collect_all : Collect multiple LazyFrames at the same time. - polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. - - Returns - ------- - If `gevent=False` (default) then returns awaitable. - - If `gevent=True` then returns wrapper that has - `.get(block=True, timeout=None)` method. - - Examples - -------- - >>> import asyncio - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> async def main(): - ... return await ( - ... lf.group_by("a", maintain_order=True) - ... .agg(pl.all().sum()) - ... .collect_async() - ... ) - ... - >>> asyncio.run(main()) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - ''' - def sink_parquet(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to a Parquet file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. 
- Choose "snappy" for more backwards compatibility guarantees - when you deal with older parquet readers. - compression_level - The level of compression to use. Higher compression means smaller files on - disk. - - - "gzip" : min-level: 0, max-level: 10. - - "brotli" : min-level: 0, max-level: 11. - - "zstd" : min-level: 1, max-level: 22. - statistics - Write statistics to the parquet headers. This requires extra compute. - row_group_size - Size of the row groups in number of rows. - If None (default), the chunks of the `DataFrame` are - used. Writing in smaller chunks may reduce memory pressure and improve - writing speeds. - data_pagesize_limit - Size limit of individual data pages. - If not set defaults to 1024 * 1024 bytes - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_parquet("out.parquet") # doctest: +SKIP - - ''' - def sink_ipc(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to an IPC file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - compression : {\'lz4\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_ipc("out.arrow") # doctest: +SKIP - - ''' - def sink_csv(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to a CSV file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - include_bom - Whether to include UTF-8 BOM in the CSV output. - include_header - Whether to include header in the CSV output. - separator - Separate CSV fields with this symbol. - line_terminator - String used to end each row. - quote_char - Byte to use as quoting character. - batch_size - Number of rows that will be processed per thread. - datetime_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. If no format specified, the default fractional-second - precision is inferred from the maximum timeunit found in the frame\'s - Datetime cols (if any). - date_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. - time_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. 
- float_precision - Number of decimal places to write, applied to both `Float32` and - `Float64` datatypes. - null_value - A string representing null values (defaulting to the empty string). - quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} - Determines the quoting strategy used. - - - necessary (default): This puts quotes around fields only when necessary. - They are necessary when fields contain a quote, - delimiter or record terminator. - Quotes are also necessary when writing an empty record - (which is indistinguishable from a record with one empty field). - This is the default. - - always: This puts quotes around every field. Always. - - never: This never puts quotes around fields, even if that results in - invalid CSV data (e.g.: by not quoting strings containing the - separator). - - non_numeric: This puts quotes around all fields that are non-numeric. - Namely, when writing a field that does not parse as a valid float - or integer, then quotes will be used even if they aren`t strictly - necessary. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_csv("out.csv") # doctest: +SKIP - - ''' - def sink_ndjson(self, path: str | Path) -> DataFrame: - ''' - Persists a LazyFrame at the provided path. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_json("out.json") # doctest: +SKIP - - ''' - def _set_sink_optimizations(self) -> PyLazyFrame: ... - def fetch(self, n_rows: int = ...) -> DataFrame: - ''' - Collect a small number of rows for debugging purposes. - - Parameters - ---------- - n_rows - Collect n_rows from the data sources. - type_coercion - Run type coercion optimization. - predicate_pushdown - Run predicate pushdown optimization. - projection_pushdown - Run projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off optimizations. - slice_pushdown - Slice pushdown optimization - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. 
- streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Notes - ----- - This is similar to a :func:`collect` operation, but it overwrites the number of - rows read by *every* scan operation. Be aware that `fetch` does not guarantee - the final number of rows in the DataFrame. Filters, join operations and fewer - rows being available in the scanned data will all influence the final number - of rows (joins are especially susceptible to this, and may return no data - at all if `n_rows` is too small as the join keys may not be present). - - Warnings - -------- - This is strictly a utility function that can help to debug queries using a - smaller number of rows, and should *not* be used in production code. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 6 │ - │ b ┆ 2 ┆ 5 │ - └─────┴─────┴─────┘ - - ''' - def lazy(self) -> Self: - ''' - Return lazy representation, i.e. itself. - - Useful for writing code that expects either a :class:`DataFrame` or - :class:`LazyFrame`. - - Returns - ------- - LazyFrame - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.lazy() # doctest: +ELLIPSIS - - - ''' - def cache(self) -> Self: - """Cache the result once the execution of the physical plan hits this node.""" - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: - ''' - Cast LazyFrame column(s) to the specified dtype(s). - - Parameters - ---------- - dtypes - Mapping of column names (or selector) to dtypes, or a single dtype - to which all columns will be cast. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> from datetime import date - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], - ... } - ... ) - - Cast specific frame columns to the specified dtypes: - - >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ u8 ┆ date │ - ╞═════╪═════╪════════════╡ - │ 1.0 ┆ 6 ┆ 2020-01-02 │ - │ 2.0 ┆ 7 ┆ 2021-03-04 │ - │ 3.0 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - Cast all frame columns to the specified dtype: - - >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) - {\'foo\': [\'1\', \'2\', \'3\'], - \'bar\': [\'6.0\', \'7.0\', \'8.0\'], - \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} - - Use selectors to define the columns being cast: - - >>> import polars.selectors as cs - >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ str │ - ╞═════╪═════╪════════════╡ - │ 1 ┆ 6 ┆ 2020-01-02 │ - │ 2 ┆ 7 ┆ 2021-03-04 │ - │ 3 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - ''' - def clear(self, n: int = ...) -> LazyFrame: - ''' - Create an empty copy of the current LazyFrame, with zero to \'n\' rows. 
- - Returns a copy with an identical schema but no data. - - Parameters - ---------- - n - Number of (empty) rows to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.clear().fetch() - shape: (0, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪═════╪══════╡ - └─────┴─────┴──────┘ - - >>> lf.clear(2).fetch() - shape: (2, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪══════╪══════╡ - │ null ┆ null ┆ null │ - │ null ┆ null ┆ null │ - └──────┴──────┴──────┘ - - ''' - def clone(self) -> Self: - ''' - Create a copy of this LazyFrame. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current LazyFrame, with identical - schema but no data. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.clone() # doctest: +ELLIPSIS - - - ''' - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: - ''' - Filter the rows in the LazyFrame based on a predicate expression. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - predicates - Expression that evaluates to a boolean Series. - constraints - Column filters. Use name=value to filter column name by the supplied value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - - Filter on one condition: - - >>> lf.filter(pl.col("foo") > 1).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Filter on multiple conditions: - - >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `*args` syntax: - - >>> lf.filter( - ... pl.col("foo") == 1, - ... pl.col("ham") == "a", - ... ).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `**kwargs` syntax: - - >>> lf.filter(foo=1, ham="a").collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Filter on an OR condition: - - >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - ''' - Select columns from this LazyFrame. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. 
Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Examples - -------- - Pass the name of a column to select that column. - - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.select("foo").collect() - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Multiple columns can be selected by passing a list of column names. - - >>> lf.select(["foo", "bar"]).collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - └─────┴─────┘ - - Multiple columns can also be selected using positional arguments instead of a - list. Expressions are also accepted. - - >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - └─────┴─────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> lf.select( - ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) - ... ).collect() - shape: (3, 1) - ┌───────────┐ - │ threshold │ - │ --- │ - │ i32 │ - ╞═══════════╡ - │ 0 │ - │ 0 │ - │ 10 │ - └───────────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... lf.select( - ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), - ... ).collect() - ... - shape: (3, 1) - ┌───────────┐ - │ is_odd │ - │ --- │ - │ struct[2] │ - ╞═══════════╡ - │ {1,0} │ - │ {0,1} │ - │ {1,0} │ - └───────────┘ - - ''' - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - """ - Select columns from this LazyFrame. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - See Also - -------- - select - - """ - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: - ''' - Start a group by operation. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Setting this to `True` blocks the possibility - to run on the streaming engine. - - Examples - -------- - Group by one column and call `agg` to compute the grouped sum of another - column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... 
) - >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 2 │ - │ b ┆ 5 │ - │ c ┆ 3 │ - └─────┴─────┘ - - Set `maintain_order=True` to ensure the order of the groups is consistent with - the input. - - >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() - shape: (3, 2) - ┌─────┬───────────┐ - │ a ┆ c │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [5, 3] │ - │ b ┆ [4, 2] │ - │ c ┆ [1] │ - └─────┴───────────┘ - - Group by multiple columns by passing a list of column names. - - >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - Or use positional arguments to group by multiple columns in the same way. - Expressions are also accepted. - - >>> lf.group_by("a", pl.col("b") // 2).agg( - ... pl.col("c").mean() - ... ).collect() # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ f64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 0 ┆ 4.0 │ - │ b ┆ 1 ┆ 3.0 │ - │ c ┆ 1 ┆ 1.0 │ - └─────┴─────┴─────┘ - - ''' - def rolling(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - Different from a `dynamic_group_by` the windows are now determined by the - individual values and are not of constant intervals. For constant intervals - use :func:`LazyFrame.group_by_dynamic`. - - If you have a time series ``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... - * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... - * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). 
- by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - group_by_dynamic - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> out = ( - ... df.rolling(index_column="dt", period="2d") - ... .agg( - ... pl.sum("a").alias("sum_a"), - ... pl.min("a").alias("min_a"), - ... pl.max("a").alias("max_a"), - ... ) - ... .collect() - ... ) - >>> out - shape: (6, 4) - ┌─────────────────────┬───────┬───────┬───────┐ - │ dt ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴───────┴───────┴───────┘ - - ''' - def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - Time windows are calculated and rows are assigned to windows. Different from a - normal group by is that a row can be member of multiple groups. - By default, the windows look like: - - - [start, start + period) - - [start + every, start + every + period) - - [start + 2*every, start + 2*every + period) - - ... - - where `start` is determined by `start_by`, `offset`, and `every` (see parameter - descriptions below). - - .. warning:: - The index column must be sorted in ascending order. If `by` is passed, then - the index column must be sorted in ascending order within each group. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - - .. deprecated:: 0.19.4 - Use `label` instead. - include_boundaries - Add the lower and upper bound of the window to the "_lower_boundary" and - "_upper_boundary" columns. 
This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - label : {\'left\', \'right\', \'datapoint\'} - Define which label to use for the window: - - - \'left\': lower boundary of the window - - \'right\': upper boundary of the window - - \'datapoint\': the first value of the index column in the given window. - If you don\'t need the label to be at one of the boundaries, choose this - option for maximum performance - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - rolling - - Notes - ----- - 1) If you\'re coming from pandas, then - - .. code-block:: python - - # polars - df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) - - is equivalent to - - .. code-block:: python - - # pandas - df.set_index("ts").resample("D")["value"].sum().reset_index() - - though note that, unlike pandas, polars doesn\'t add extra rows for empty - windows. If you need `index_column` to be evenly spaced, then please combine - with :func:`DataFrame.upsample`. - - 2) The `every`, `period` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a group_by_dynamic on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Examples - -------- - >>> from datetime import datetime - >>> lf = pl.LazyFrame( - ... { - ... "time": pl.datetime_range( - ... start=datetime(2021, 12, 16), - ... end=datetime(2021, 12, 16, 3), - ... interval="30m", - ... eager=True, - ... ), - ... "n": range(7), - ... } - ... 
) - >>> lf.collect() - shape: (7, 2) - ┌─────────────────────┬─────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i64 │ - ╞═════════════════════╪═════╡ - │ 2021-12-16 00:00:00 ┆ 0 │ - │ 2021-12-16 00:30:00 ┆ 1 │ - │ 2021-12-16 01:00:00 ┆ 2 │ - │ 2021-12-16 01:30:00 ┆ 3 │ - │ 2021-12-16 02:00:00 ┆ 4 │ - │ 2021-12-16 02:30:00 ┆ 5 │ - │ 2021-12-16 03:00:00 ┆ 6 │ - └─────────────────────┴─────┘ - - Group by windows of 1 hour starting at 2021-12-16 00:00:00. - - >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( - ... pl.col("n") - ... ).collect() - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [1, 2] │ - │ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ 2021-12-16 02:00:00 ┆ [5, 6] │ - └─────────────────────┴───────────┘ - - The window boundaries can also be added to the aggregation result - - >>> lf.group_by_dynamic( - ... "time", every="1h", include_boundaries=True, closed="right" - ... ).agg(pl.col("n").mean()).collect() - shape: (4, 4) - ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ - │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ - ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ - │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ - │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ - │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ - │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ - └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ - - When closed="left", the window excludes the right end of interval: - [lower_bound, upper_bound) - - >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( - ... pl.col("n") - ... ).collect() - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-16 00:00:00 ┆ [0, 1] │ - │ 2021-12-16 01:00:00 ┆ [2, 3] │ - │ 2021-12-16 02:00:00 ┆ [4, 5] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - When closed="both" the time values at the window boundaries belong to 2 groups. - - >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( - ... pl.col("n") - ... ).collect() - shape: (5, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ - │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - Dynamic group bys can also be combined with grouping on normal keys - - >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) - >>> lf.collect() - shape: (7, 3) - ┌─────────────────────┬─────┬────────┐ - │ time ┆ n ┆ groups │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ str │ - ╞═════════════════════╪═════╪════════╡ - │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ - │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ - │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ - │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ - │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ - │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ - │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ - └─────────────────────┴─────┴────────┘ - >>> lf.group_by_dynamic( - ... "time", - ... 
every="1h", - ... closed="both", - ... by="groups", - ... include_boundaries=True, - ... ).agg(pl.col("n")).collect() - shape: (7, 5) - ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ - │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ - ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ - │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ - │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ - │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ - │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ - │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ - └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ - - Dynamic group by on an index column - - >>> lf = pl.LazyFrame( - ... { - ... "idx": pl.int_range(0, 6, eager=True), - ... "A": ["A", "A", "B", "B", "B", "C"], - ... } - ... ) - >>> lf.group_by_dynamic( - ... "idx", - ... every="2i", - ... period="3i", - ... include_boundaries=True, - ... closed="right", - ... ).agg(pl.col("A").alias("A_agg_list")).collect() - shape: (4, 4) - ┌─────────────────┬─────────────────┬─────┬─────────────────┐ - │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 ┆ list[str] │ - ╞═════════════════╪═════════════════╪═════╪═════════════════╡ - │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ - │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ - │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ - │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ - └─────────────────┴─────────────────┴─────┴─────────────────┘ - - ''' - def join_asof(self, other: LazyFrame) -> Self: - ''' - Perform an asof join. - - This is similar to a left-join except that we match on nearest key rather than - equal keys. - - Both DataFrames must be sorted by the join_asof key. - - For each row in the left DataFrame: - - - A "backward" search selects the last row in the right DataFrame whose - \'on\' key is less than or equal to the left\'s key. - - - A "forward" search selects the first row in the right DataFrame whose - \'on\' key is greater than or equal to the left\'s key. - - A "nearest" search selects the last row in the right DataFrame whose value - is nearest to the left\'s key. String keys are not currently supported for a - nearest search. - - The default is "backward". - - Parameters - ---------- - other - Lazy DataFrame to join with. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - by - Join on these columns before doing asof join. - by_left - Join on these columns before doing asof join. - by_right - Join on these columns before doing asof join. - strategy : {\'backward\', \'forward\', \'nearest\'} - Join strategy. - suffix - Suffix to append to columns with a duplicate name. - tolerance - Numeric tolerance. By setting this the join will only be done if the near - keys are within this distance. 
If an asof join is done on columns of dtype - "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta - object or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. - - Examples - -------- - >>> from datetime import datetime - >>> gdp = pl.LazyFrame( - ... { - ... "date": [ - ... datetime(2016, 1, 1), - ... datetime(2017, 1, 1), - ... datetime(2018, 1, 1), - ... datetime(2019, 1, 1), - ... ], # note record date: Jan 1st (sorted!) - ... "gdp": [4164, 4411, 4566, 4696], - ... } - ... ).set_sorted("date") - >>> population = pl.LazyFrame( - ... { - ... "date": [ - ... datetime(2016, 5, 12), - ... datetime(2017, 5, 12), - ... datetime(2018, 5, 12), - ... datetime(2019, 5, 12), - ... ], # note record date: May 12th (sorted!) - ... "population": [82.19, 82.66, 83.12, 83.52], - ... } - ... ).set_sorted("date") - >>> population.join_asof(gdp, on="date", strategy="backward").collect() - shape: (4, 3) - ┌─────────────────────┬────────────┬──────┐ - │ date ┆ population ┆ gdp │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ f64 ┆ i64 │ - ╞═════════════════════╪════════════╪══════╡ - │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ - │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ - │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ - │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ - └─────────────────────┴────────────┴──────┘ - - ''' - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: - ''' - Add a join operation to the Logical Plan. - - Parameters - ---------- - other - Lazy DataFrame to join with. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} - Join strategy. - - .. note:: - A left join preserves the row order of the left DataFrame. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - suffix - Suffix to append to columns with a duplicate name. - validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} - Checks if join is of specified type. - - * *many_to_many* - “m:m”: default, does not result in checks - * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets - * *one_to_many* - “1:m”: check if join keys are unique in left dataset - * *many_to_one* - “m:1”: check if join keys are unique in right dataset - - .. note:: - - - This is currently not supported the streaming engine. - - This is only supported when joined by single columns. - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. 
- - See Also - -------- - join_asof - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> other_lf = pl.LazyFrame( - ... { - ... "apple": ["x", "y", "z"], - ... "ham": ["a", "b", "d"], - ... } - ... ) - >>> lf.join(other_lf, on="ham").collect() - shape: (2, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - └─────┴─────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="outer").collect() - shape: (4, 4) - ┌──────┬──────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞══════╪══════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ null ┆ null ┆ d ┆ z │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └──────┴──────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="left").collect() - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └─────┴─────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="semi").collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 7.0 ┆ b │ - └─────┴─────┴─────┘ - >>> lf.join(other_lf, on="ham", how="anti").collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - ''' - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - Notes - ----- - Creating a new LazyFrame using this method does not create a new copy of - existing data. - - Examples - -------- - Pass an expression to add it as a new column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ a^2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ - └─────┴──────┴───────┴──────┘ - - Added columns will replace existing columns with the same name. - - >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1.0 ┆ 0.5 ┆ true │ - │ 2.0 ┆ 4.0 ┆ true │ - │ 3.0 ┆ 10.0 ┆ false │ - │ 4.0 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - Multiple columns can be added by passing a list of expressions. - - >>> lf.with_columns( - ... [ - ... 
(pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ] - ... ).collect() - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Multiple columns also can be added using positional arguments instead of a list. - - >>> lf.with_columns( - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ).collect() - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> lf.with_columns( - ... ab=pl.col("a") * pl.col("b"), - ... not_c=pl.col("c").not_(), - ... ).collect() - shape: (4, 5) - ┌─────┬──────┬───────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ ab ┆ not_c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ - └─────┴──────┴───────┴──────┴───────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... lf.drop("c").with_columns( - ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), - ... ).collect() - ... - shape: (4, 3) - ┌─────┬──────┬─────────────┐ - │ a ┆ b ┆ diffs │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ struct[2] │ - ╞═════╪══════╪═════════════╡ - │ 1 ┆ 0.5 ┆ {null,null} │ - │ 2 ┆ 4.0 ┆ {1,3.5} │ - │ 3 ┆ 10.0 ┆ {1,6.0} │ - │ 4 ┆ 13.0 ┆ {1,3.0} │ - └─────┴──────┴─────────────┘ - - ''' - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - """ - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - See Also - -------- - with_columns - - """ - def with_context(self, other: Self | list[Self]) -> Self: - ''' - Add an external context to the computation graph. - - This allows expressions to also access columns from DataFrames - that are not part of this one. - - Parameters - ---------- - other - Lazy DataFrame to join with. 
- - Examples - -------- - >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) - >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) - >>> lf.with_context(lf_other).select( - ... pl.col("b") + pl.col("c").first() - ... ).collect() - shape: (3, 1) - ┌──────┐ - │ b │ - │ --- │ - │ str │ - ╞══════╡ - │ afoo │ - │ cfoo │ - │ null │ - └──────┘ - - Fill nulls with the median from another DataFrame: - - >>> train_lf = pl.LazyFrame( - ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} - ... ) - >>> test_lf = pl.LazyFrame( - ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} - ... ) - >>> test_lf.with_context( - ... train_lf.select(pl.all().name.suffix("_train")) - ... ).select( - ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) - ... ).collect() - shape: (3, 1) - ┌───────────┐ - │ feature_0 │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -1.0 │ - │ 0.0 │ - │ 1.0 │ - └───────────┘ - - ''' - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Remove columns from the DataFrame. - - Parameters - ---------- - columns - Name of the column(s) that should be removed from the DataFrame. - *more_columns - Additional columns to drop, specified as positional arguments. - - Examples - -------- - Drop a single column by passing the name of that column. - - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.drop("ham").collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 6.0 │ - │ 2 ┆ 7.0 │ - │ 3 ┆ 8.0 │ - └─────┴─────┘ - - Drop multiple columns by passing a selector. - - >>> import polars.selectors as cs - >>> lf.drop(cs.numeric()).collect() - shape: (3, 1) - ┌─────┐ - │ ham │ - │ --- │ - │ str │ - ╞═════╡ - │ a │ - │ b │ - │ c │ - └─────┘ - - Use positional arguments to drop multiple columns. - - >>> lf.drop("foo", "ham").collect() - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 6.0 │ - │ 7.0 │ - │ 8.0 │ - └─────┘ - - ''' - def rename(self, mapping: dict[str, str]) -> Self: - ''' - Rename column names. - - Parameters - ---------- - mapping - Key value pairs that map from old name to new name. - - Notes - ----- - If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), - polars will block projection and predicate pushdowns at this node. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.rename({"foo": "apple"}).collect() - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - - ''' - def reverse(self) -> Self: - ''' - Reverse the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "key": ["a", "b", "c"], - ... "val": [1, 2, 3], - ... } - ... ) - >>> lf.reverse().collect() - shape: (3, 2) - ┌─────┬─────┐ - │ key ┆ val │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ c ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 1 │ - └─────┴─────┘ - - ''' - def shift(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. 
- fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... ) - >>> lf.shift().collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ null ┆ null │ - │ 1 ┆ 5 │ - │ 2 ┆ 6 │ - │ 3 ┆ 7 │ - └──────┴──────┘ - - Pass a negative value to shift in the opposite direction instead. - - >>> lf.shift(-2).collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ null ┆ null │ - │ null ┆ null │ - └──────┴──────┘ - - Specify `fill_value` to fill the resulting null values. - - >>> lf.shift(-2, fill_value=100).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ 100 ┆ 100 │ - │ 100 ┆ 100 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int, length: int | None = ...) -> Self: - ''' - Get a slice of this DataFrame. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> lf.slice(1, 2).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ y ┆ 3 ┆ 4 │ - │ z ┆ 5 ┆ 6 │ - └─────┴─────┴─────┘ - - ''' - def limit(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Alias for :func:`LazyFrame.head`. - - Parameters - ---------- - n - Number of rows to return. - - Notes - ----- - Consider using the :func:`fetch` operation if you only want to test your - query. The :func:`fetch` operation will load the first `n` rows at the scan - level, whereas the :func:`head`/:func:`limit` are applied at the end. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... ) - >>> lf.limit().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - └─────┴─────┘ - >>> lf.limit(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - └─────┴─────┘ - - ''' - def head(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Notes - ----- - Consider using the :func:`fetch` operation if you only want to test your - query. The :func:`fetch` operation will load the first `n` rows at the scan - level, whereas the :func:`head`/:func:`limit` are applied at the end. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... 
) - >>> lf.head().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - └─────┴─────┘ - >>> lf.head(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - └─────┴─────┘ - - ''' - def tail(self, n: int = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... ) - >>> lf.tail().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - │ 6 ┆ 12 │ - └─────┴─────┘ - >>> lf.tail(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 5 ┆ 11 │ - │ 6 ┆ 12 │ - └─────┴─────┘ - - ''' - def last(self) -> Self: - ''' - Get the last row of the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.last().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 5 ┆ 6 │ - └─────┴─────┘ - - ''' - def first(self) -> Self: - ''' - Get the first row of the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.first().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_n_unique(self) -> Self: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.approx_n_unique().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_unique(self) -> Self: - """ - Approximate count of unique values. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`LazyFrame.approx_n_unique`. - - """ - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: - ''' - Add a column at index 0 that counts the rows. - - Parameters - ---------- - name - Name of the column to add. - offset - Start the row count at this offset. - - Warnings - -------- - This can have a negative effect on query performance. - This may, for instance, block predicate pushdown optimization. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.with_row_count().collect() - shape: (3, 3) - ┌────────┬─────┬─────┐ - │ row_nr ┆ a ┆ b │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╡ - │ 0 ┆ 1 ┆ 2 │ - │ 1 ┆ 3 ┆ 4 │ - │ 2 ┆ 5 ┆ 6 │ - └────────┴─────┴─────┘ - - ''' - def gather_every(self, n: int) -> Self: - ''' - Take every nth row in the LazyFrame and return as a new LazyFrame. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... 
) - >>> lf.gather_every(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 5 │ - │ 3 ┆ 7 │ - └─────┴─────┘ - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - matches_supertype - Fill all matching supertypes of the fill `value` literal. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, None, 4], - ... "b": [0.5, 4, None, 13], - ... } - ... ) - >>> lf.fill_null(99).collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 99 ┆ 99.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - >>> lf.fill_null(strategy="forward").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> lf.fill_null(strategy="max").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> lf.fill_null(strategy="zero").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 0 ┆ 0.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Self: - ''' - Fill floating point NaN values. - - Parameters - ---------- - value - Value to fill the NaN values with. - - Warnings - -------- - Note that floating point NaN (Not a Number) are not missing values! - To replace missing values, use :func:`fill_null` instead. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1.5, 2, float("nan"), 4], - ... "b": [0.5, 4, float("nan"), 13], - ... } - ... ) - >>> lf.fill_nan(99).collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════╡ - │ 1.5 ┆ 0.5 │ - │ 2.0 ┆ 4.0 │ - │ 99.0 ┆ 99.0 │ - │ 4.0 ┆ 13.0 │ - └──────┴──────┘ - - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their standard deviation value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.std().collect() - shape: (1, 2) - ┌──────────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪═════╡ - │ 1.290994 ┆ 0.5 │ - └──────────┴─────┘ - >>> lf.std(ddof=0).collect() - shape: (1, 2) - ┌──────────┬──────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪══════════╡ - │ 1.118034 ┆ 0.433013 │ - └──────────┴──────────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their variance value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. 
- - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.var().collect() - shape: (1, 2) - ┌──────────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪══════╡ - │ 1.666667 ┆ 0.25 │ - └──────────┴──────┘ - >>> lf.var(ddof=0).collect() - shape: (1, 2) - ┌──────┬────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪════════╡ - │ 1.25 ┆ 0.1875 │ - └──────┴────────┘ - - ''' - def max(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their maximum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.max().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def min(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their minimum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.min().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 1 │ - └─────┴─────┘ - - ''' - def sum(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their sum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.sum().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 10 ┆ 5 │ - └─────┴─────┘ - - ''' - def mean(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their mean value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.mean().collect() - shape: (1, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════╡ - │ 2.5 ┆ 1.25 │ - └─────┴──────┘ - - ''' - def median(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their median value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.median().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 2.5 ┆ 1.0 │ - └─────┴─────┘ - - ''' - def null_count(self) -> Self: - ''' - Aggregate the columns in the LazyFrame as the sum of their null value count. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, None, 3], - ... "bar": [6, 7, None], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.null_count().collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 1 ┆ 0 │ - └─────┴─────┴─────┘ - - ''' - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... 
) - >>> lf.quantile(0.7).collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 3.0 ┆ 1.0 │ - └─────┴─────┘ - - ''' - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: - ''' - Explode the DataFrame to long format by exploding the given columns. - - Parameters - ---------- - columns - Column names, expressions, or a selector defining them. The underlying - columns being exploded must be of List or Utf8 datatype. - *more_columns - Additional names of columns to explode, specified as positional arguments. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "letters": ["a", "a", "b", "c"], - ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], - ... } - ... ) - >>> lf.explode("numbers").collect() - shape: (8, 2) - ┌─────────┬─────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════════╪═════════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ a ┆ 3 │ - │ b ┆ 4 │ - │ b ┆ 5 │ - │ c ┆ 6 │ - │ c ┆ 7 │ - │ c ┆ 8 │ - └─────────┴─────────┘ - - ''' - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Drop duplicate rows from this DataFrame. - - Parameters - ---------- - subset - Column name(s) or selector(s), to consider when identifying - duplicate rows. If set to `None` (default), use all columns. - keep : {\'first\', \'last\', \'any\', \'none\'} - Which of the duplicate rows to keep. - - * \'any\': Does not give any guarantee of which row is kept. - This allows more optimizations. - * \'none\': Don\'t keep duplicate rows. - * \'first\': Keep first unique row. - * \'last\': Keep last unique row. - maintain_order - Keep the same order as the original DataFrame. This is more expensive to - compute. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - Returns - ------- - LazyFrame - LazyFrame with unique rows. - - Warnings - -------- - This method will fail if there is a column of type `List` in the DataFrame or - subset. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3, 1], - ... "bar": ["a", "a", "a", "a"], - ... "ham": ["b", "b", "b", "b"], - ... } - ... ) - >>> lf.unique(maintain_order=True).collect() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> lf.unique(keep="last", maintain_order=True).collect() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - - ''' - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Drop all rows that contain null values. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - subset - Column name(s) for which null values are considered. - If set to `None` (default), use all columns. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, None, 8], - ... "ham": ["a", "b", None], - ... } - ... 
) - - The default behavior of this method is to drop rows where any single - value of the row is null. - - >>> lf.drop_nulls().collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - This behaviour can be constrained to consider only a subset of columns, as - defined by name or with a selector. For example, dropping rows if there is - a null in any of the integer columns: - - >>> import polars.selectors as cs - >>> lf.drop_nulls(subset=cs.integer()).collect() - shape: (2, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ null │ - └─────┴─────┴──────┘ - - This method drops a row if any single value of the row is null. - - Below are some example snippets that show how you could drop null - values based on other conditions: - - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, None, None, None], - ... "b": [1, 2, None, 1], - ... "c": [1, None, None, 1], - ... } - ... ) - >>> lf.collect() - shape: (4, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪══════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ null ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴──────┴──────┘ - - Drop a row only if all values are null: - - >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴─────┴──────┘ - - ''' - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: - ''' - Unpivot a DataFrame from wide to long format. - - Optionally leaves identifiers set. - - This function is useful to massage a DataFrame into a format where one or more - columns are identifier variables (id_vars) while all other columns, considered - measured variables (value_vars), are "unpivoted" to the row axis leaving just - two non-identifier columns, \'variable\' and \'value\'. - - Parameters - ---------- - id_vars - Column(s) or selector(s) to use as identifier variables. - value_vars - Column(s) or selector(s) to use as values variables; if `value_vars` - is empty all columns that are not in `id_vars` will be used. - variable_name - Name to give to the `variable` column. Defaults to "variable" - value_name - Name to give to the `value` column. Defaults to "value" - streamable - Allow this node to run in the streaming engine. - If this runs in streaming, the output of the melt operation - will not have a stable ordering. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> import polars.selectors as cs - >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() - shape: (6, 3) - ┌─────┬──────────┬───────┐ - │ a ┆ variable ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 │ - ╞═════╪══════════╪═══════╡ - │ x ┆ b ┆ 1 │ - │ y ┆ b ┆ 3 │ - │ z ┆ b ┆ 5 │ - │ x ┆ c ┆ 2 │ - │ y ┆ c ┆ 4 │ - │ z ┆ c ┆ 6 │ - └─────┴──────────┴───────┘ - - ''' - def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: - ''' - Apply a custom function. 
- - It is important that the function returns a Polars DataFrame. - - Parameters - ---------- - function - Lambda/ function to apply. - predicate_pushdown - Allow predicate pushdown optimization to pass this node. - projection_pushdown - Allow projection pushdown optimization to pass this node. - slice_pushdown - Allow slice pushdown optimization to pass this node. - no_optimizations - Turn off all optimizations past this point. - schema - Output schema of the function, if set to `None` we assume that the schema - will remain unchanged by the applied function. - validate_output_schema - It is paramount that polars\' schema is correct. This flag will ensure that - the output schema of this function will be checked with the expected schema. - Setting this to `False` will not do this check, but may lead to hard to - debug bugs. - streamable - Whether the function that is given is eligible to be running with the - streaming engine. That means that the function must produce the same result - when it is executed in batches or when it is be executed on the full - dataset. - - Warnings - -------- - The `schema` of a `LazyFrame` must always be correct. It is up to the caller - of this function to ensure that this invariant is upheld. - - It is important that the optimization flags are correct. If the custom function - for instance does an aggregation of a column, `predicate_pushdown` should not - be allowed, as this prunes rows and will influence your aggregation results. - - Examples - -------- - >>> lf = ( # doctest: +SKIP - ... pl.LazyFrame( - ... { - ... "a": pl.int_range(-100_000, 0, eager=True), - ... "b": pl.int_range(0, 100_000, eager=True), - ... } - ... ) - ... .map_batches(lambda x: 2 * x, streamable=True) - ... .collect(streaming=True) - ... ) - shape: (100_000, 2) - ┌─────────┬────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════════╪════════╡ - │ -200000 ┆ 0 │ - │ -199998 ┆ 2 │ - │ -199996 ┆ 4 │ - │ -199994 ┆ 6 │ - │ … ┆ … │ - │ -8 ┆ 199992 │ - │ -6 ┆ 199994 │ - │ -4 ┆ 199996 │ - │ -2 ┆ 199998 │ - └─────────┴────────┘ - - ''' - def interpolate(self) -> Self: - ''' - Interpolate intermediate values. The interpolation method is linear. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, None, 9, 10], - ... "bar": [6, 7, 9, None], - ... "baz": [1, None, None, 9], - ... } - ... ) - >>> lf.interpolate().collect() - shape: (4, 3) - ┌──────┬──────┬──────────┐ - │ foo ┆ bar ┆ baz │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════════╡ - │ 1.0 ┆ 6.0 ┆ 1.0 │ - │ 5.0 ┆ 7.0 ┆ 3.666667 │ - │ 9.0 ┆ 9.0 ┆ 6.333333 │ - │ 10.0 ┆ null ┆ 9.0 │ - └──────┴──────┴──────────┘ - - ''' - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Decompose struct columns into separate columns for each of their fields. - - The new columns will be inserted into the DataFrame at the location of the - struct column. - - Parameters - ---------- - columns - Name of the struct column(s) that should be unnested. - *more_columns - Additional columns to unnest, specified as positional arguments. - - Examples - -------- - >>> df = pl.LazyFrame( - ... { - ... "before": ["foo", "bar"], - ... "t_a": [1, 2], - ... "t_b": ["a", "b"], - ... "t_c": [True, None], - ... "t_d": [[1, 2], [3]], - ... "after": ["baz", "womp"], - ... } - ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") - >>> df.collect() - shape: (2, 3) - ┌────────┬─────────────────────┬───────┐ - │ before ┆ t_struct ┆ after │ - │ --- ┆ --- ┆ --- │ - │ str ┆ struct[4] ┆ str │ - ╞════════╪═════════════════════╪═══════╡ - │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ - │ bar ┆ {2,"b",null,[3]} ┆ womp │ - └────────┴─────────────────────┴───────┘ - >>> df.unnest("t_struct").collect() - shape: (2, 6) - ┌────────┬─────┬─────┬──────┬───────────┬───────┐ - │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ - ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ - │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ - │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ - └────────┴─────┴─────┴──────┴───────────┴───────┘ - - ''' - def merge_sorted(self, other: LazyFrame, key: str) -> Self: - ''' - Take two sorted DataFrames and merge them by the sorted key. - - The output of this operation will also be sorted. - It is the callers responsibility that the frames are sorted - by that key otherwise the output will not make sense. - - The schemas of both LazyFrames must be equal. - - Parameters - ---------- - other - Other DataFrame that must be merged - key - Key that is sorted. - - Examples - -------- - >>> df0 = pl.LazyFrame( - ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} - ... ).sort("age") - >>> df0.collect() - shape: (3, 2) - ┌───────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═════╡ - │ bob ┆ 18 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └───────┴─────┘ - >>> df1 = pl.LazyFrame( - ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} - ... ).sort("age") - >>> df1.collect() - shape: (4, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - └────────┴─────┘ - >>> df0.merge_sorted(df1, key="age").collect() - shape: (7, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ bob ┆ 18 │ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └────────┴─────┘ - ''' - def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: - """ - Indicate that one or multiple columns are sorted. - - Parameters - ---------- - column - Columns that are sorted - more_columns - Additional columns that are sorted, specified as positional arguments. - descending - Whether the columns are sorted in descending order. - """ - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: - ''' - Update the values in this `LazyFrame` with the non-null values in `other`. - - Parameters - ---------- - other - LazyFrame that will be used to update the values - on - Column names that will be joined on; if given `None` the implicit row - index is used as a join key instead. - left_on - Join column(s) of the left DataFrame. - right_on - Join column(s) of the right DataFrame. - how : {\'left\', \'inner\', \'outer\'} - * \'left\' will keep all rows from the left table; rows may be duplicated - if multiple rows in the right frame match the left row\'s key. - * \'inner\' keeps only those rows where the key exists in both frames. 
- * \'outer\' will update existing rows where the key matches while also - adding any new rows contained in the given frame. - include_nulls - If True, null values from the right DataFrame will be used to update the - left DataFrame. - - Notes - ----- - This is syntactic sugar for a left/inner join, with an optional coalesce when - `include_nulls = False`. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "A": [1, 2, 3, 4], - ... "B": [400, 500, 600, 700], - ... } - ... ) - >>> lf.collect() - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 400 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - >>> new_lf = pl.LazyFrame( - ... { - ... "B": [-66, None, -99], - ... "C": [5, 3, 1], - ... } - ... ) - - Update `df` values with the non-null values in `new_df`, by row index: - - >>> lf.update(new_lf).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, by row index, - but only keeping those rows that are common to both frames: - - >>> lf.update(new_lf, how="inner").collect() - shape: (3, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() - shape: (5, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴─────┘ - - Update `df` values including null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> lf.update( - ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True - ... ).collect() - shape: (5, 2) - ┌─────┬──────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ null │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴──────┘ - - ''' - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: - """ - Start a group by operation. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.group_by`. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - """ - def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. 
- period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - """ - def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.9 - This method has been renamed to :func:`LazyFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - """ - def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.group_by_dynamic`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - include_boundaries - Add the lower and upper bound of the window to the "_lower_bound" and - "_upper_bound" columns. 
This will impact performance because it\'s harder to - parallelize - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - ''' - def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: - """ - Apply a custom function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.map_batches`. - - Parameters - ---------- - function - Lambda/ function to apply. - predicate_pushdown - Allow predicate pushdown optimization to pass this node. - projection_pushdown - Allow projection pushdown optimization to pass this node. - slice_pushdown - Allow slice pushdown optimization to pass this node. - no_optimizations - Turn off all optimizations past this point. - schema - Output schema of the function, if set to `None` we assume that the schema - will remain unchanged by the applied function. - validate_output_schema - It is paramount that polars' schema is correct. This flag will ensure that - the output schema of this function will be checked with the expected schema. - Setting this to `False` will not do this check, but may lead to hard to - debug bugs. - streamable - Whether the function that is given is eligible to be running with the - streaming engine. That means that the function must produce the same result - when it is executed in batches or when it is be executed on the full - dataset. - - """ - def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def take_every(self, n: int) -> Self: - """ - Take every nth row in the LazyFrame and return as a new LazyFrame. - - .. deprecated:: 0.19.0 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - @property - def columns(self): ... - @property - def dtypes(self): ... - @property - def schema(self): ... - @property - def width(self): ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..66c741b --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/lazyframe/frame.pyi @@ -0,0 +1,4174 @@ +#: version 0.20.2 +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use `pl.scan_csv` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use `pl.scan_parquet` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use `pl.scan_ipc` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use `pl.scan_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. 
+ + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + ... + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to `True`. + If this is set to `True` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. 
+ streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. 
+ Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... 
"c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to `False`. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + This functionality is currently in an alpha state. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + DataFrame directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. 
+ + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + ... + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. 
+ no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. 
+ Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def sink_ndjson(self, path: str | Path) -> DataFrame: + ''' + Persists a LazyFrame at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_json("out.json") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. 
+ + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... 
) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters. Use name=value to filter column name by the supplied value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... ).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. 
+ + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + ... + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to `True` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... 
).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `dynamic_group_by` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... 
pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... ) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... 
+ * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\', \'outer_coalesce\'} + Join strategy. + + * *inner* + Returns rows that have matching values in both tables + * *left* + Returns all rows from the left table, and the matched rows from the + right table + * *outer* + Returns all rows when there is a match in either left or right table + * *outer_coalesce* + Same as \'outer\', but coalesces the key columns + * *cross* + Returns the cartisian product of rows from both tables + * *semi* + Filter rows that have a match in the right table. + * *anti* + Filter rows that not have a match in the right table. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. 
+ + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + join_nulls + Join on null values. By default null values will never produce matches. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 5) + ┌──────┬──────┬──────┬───────┬───────────┐ + │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞══════╪══════╪══════╪═══════╪═══════════╡ + │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │ + │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │ + │ null ┆ null ┆ null ┆ z ┆ d │ + │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │ + └──────┴──────┴──────┴───────┴───────────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + ... + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. 
+ + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another DataFrame: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the DataFrame. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the DataFrame. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), + polars will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def gather_every(self, n: int) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.gather_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill `value` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the DataFrame to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or Utf8 datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this DataFrame. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. 
+ + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The `schema` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, `predicate_pushdown` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... 
) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the DataFrame at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... 
).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + .. warning:: + This functionality is experimental and may change without it being + considered a breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on; if given `None` the implicit row + index is used as a join key instead. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + include_nulls + If True, null values from the right DataFrame will be used to update the + left DataFrame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... 
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. 
+ Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. 
+ * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars cannot check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output. + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to run with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is executed on the full + dataset. + + """ + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int) -> Self: + """ + Take every nth row in the LazyFrame and return as a new LazyFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ...
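Of the non-deprecated LazyFrame methods stubbed above, `with_columns_seq` and `set_sorted` are the only ones whose docstrings carry no usage example. A minimal sketch of how the two combine, assuming a polars release that ships both methods with the signatures shown above (the data and column names are made up for illustration):

import polars as pl

# "ts" is already in ascending order, so declare it as sorted (no check is run),
# then evaluate two cheap expressions sequentially rather than in parallel.
lf = (
    pl.LazyFrame({"ts": [1, 2, 3], "a": [1.0, 2.0, 3.0]})
    .set_sorted("ts")
    .with_columns_seq(
        a_sq=pl.col("a") ** 2,
        a_plus_one=pl.col("a") + 1,
    )
)
print(lf.collect())

Because the expressions here are trivial, `with_columns_seq` skips the parallel scheduling that `with_columns` would otherwise spend on them, matching the stub's advice to use it when the work per expression is cheap.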
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/series/series deleted file mode 100644 index 4a40006..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/series/series +++ /dev/null @@ -1,4988 +0,0 @@ -import np as np -import pa as pa -import pd as pd -from builtins import PySeries -from datetime import date, datetime, timedelta -from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Object as Object, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown, Utf8 as Utf8 -from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat -from polars.exceptions import ShapeError as ShapeError -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence - -TYPE_CHECKING: bool -_PYARROW_AVAILABLE: bool - -class Series: - _s: _ClassVar[None] = ... - _accessors: _ClassVar[set] = ... 
- def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array) -> Self: - """Construct a Series from an Arrow Array.""" - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: - """Construct a Series from a pandas Series or DatetimeIndex.""" - def _get_ptr(self) -> tuple[int, int, int]: - """ - Get a pointer to the start of the values buffer of a numeric Series. - - This will raise an error if the `Series` contains multiple chunks. - - This will return the offset, length and the pointer itself. - - """ - def __bool__(self) -> NoReturn: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Series: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... - def __eq__(self, other: Any) -> Series | Expr: ... - def __ne__(self, other: Any) -> Series | Expr: ... - def __gt__(self, other: Any) -> Series | Expr: ... - def __lt__(self, other: Any) -> Series | Expr: ... - def __ge__(self, other: Any) -> Series | Expr: ... - def __le__(self, other: Any) -> Series | Expr: ... - def le(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series <= other`.""" - def lt(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series < other`.""" - def eq(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series == other`.""" - def eq_missing(self, other: Any) -> Self | Expr: - ''' - Method equivalent of equality operator `series == other` where `None == None`. - - This differs from the standard `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - See Also - -------- - ne_missing - eq - - Examples - -------- - >>> s1 = pl.Series("a", [333, 200, None]) - >>> s2 = pl.Series("a", [100, 200, None]) - >>> s1.eq(s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - null - ] - >>> s1.eq_missing(s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - ''' - def ne(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series != other`.""" - def ne_missing(self, other: Any) -> Self | Expr: - ''' - Method equivalent of equality operator `series != other` where `None == None`. - - This differs from the standard `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - See Also - -------- - eq_missing - ne - - Examples - -------- - >>> s1 = pl.Series("a", [333, 200, None]) - >>> s2 = pl.Series("a", [100, 200, None]) - >>> s1.ne(s2) - shape: (3,) - Series: \'a\' [bool] - [ - true - false - null - ] - >>> s1.ne_missing(s2) - shape: (3,) - Series: \'a\' [bool] - [ - true - false - false - ] - - ''' - def ge(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series >= other`.""" - def gt(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series > other`.""" - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... 
- def __add__(self, other: Any) -> Self | DataFrame | Expr: ... - def __sub__(self, other: Any) -> Self | Expr: ... - def __truediv__(self, other: Any) -> Series | Expr: ... - def __floordiv__(self, other: Any) -> Series | Expr: ... - def __invert__(self) -> Series: ... - def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... - def __mod__(self, other: Any) -> Series | Expr: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: - """ - Numpy __array__ interface protocol. - - Ensures that `np.asarray(pl.Series(..))` works as expected, see - https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. - """ - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: - """Numpy universal functions.""" - def __column_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. - """ - def _repr_html_(self) -> str: - """Format output data in HTML for display in Jupyter Notebooks.""" - def item(self, index: int | None = ...) -> Any: - ''' - Return the Series as a scalar, or return the element at the given index. - - If no index is provided, this is equivalent to `s[0]`, with a check - that the shape is (1,). With an index, this is equivalent to `s[index]`. - - Examples - -------- - >>> s1 = pl.Series("a", [1]) - >>> s1.item() - 1 - >>> s2 = pl.Series("a", [9, 8, 7]) - >>> s2.cum_sum().item(-1) - 24 - - ''' - def estimated_size(self, unit: SizeUnit = ...) -> int | float: - ''' - Return an estimation of the total (heap) allocated size of the Series. - - Estimated size is given in the specified unit (bytes by default). - - This estimation is the sum of the size of its buffers, validity, including - nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the - size of 2 arrays is not the sum of the sizes computed from this function. In - particular, [`StructArray`]\'s size is an upper bound. - - When an array is sliced, its allocated size remains constant because the buffer - unchanged. However, this function will yield a smaller number. This is because - this function returns the visible size of the buffer, not its total capacity. 
- - FFI buffers are included in this estimation. - - Parameters - ---------- - unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} - Scale the returned size to the given unit. - - Examples - -------- - >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) - >>> s.estimated_size() - 4000000 - >>> s.estimated_size("mb") - 3.814697265625 - - ''' - def sqrt(self) -> Series: - """ - Compute the square root of the elements. - - Syntactic sugar for - - >>> pl.Series([1, 2]) ** 0.5 - shape: (2,) - Series: '' [f64] - [ - 1.0 - 1.414214 - ] - - """ - def cbrt(self) -> Series: - """ - Compute the cube root of the elements. - - Optimization for - - >>> pl.Series([1, 2]) ** (1.0 / 3) - shape: (2,) - Series: '' [f64] - [ - 1.0 - 1.259921 - ] - - """ - def any(self) -> bool | None: - """ - Return whether any of the values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is `None`. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - bool or None - - Examples - -------- - >>> pl.Series([True, False]).any() - True - >>> pl.Series([False, False]).any() - False - >>> pl.Series([None, False]).any() - False - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None - - """ - def all(self) -> bool | None: - """ - Return whether all values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is `None`. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - bool or None - - Examples - -------- - >>> pl.Series([True, True]).all() - True - >>> pl.Series([False, True]).all() - False - >>> pl.Series([None, True]).all() - True - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None - - """ - def log(self, base: float = ...) -> Series: - """Compute the logarithm to a given base.""" - def log1p(self) -> Series: - """Compute the natural logarithm of the input array plus one, element-wise.""" - def log10(self) -> Series: - """Compute the base 10 logarithm of the input array, element-wise.""" - def exp(self) -> Series: - """Compute the exponential, element-wise.""" - def drop_nulls(self) -> Series: - ''' - Drop all null values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nans - - Notes - ----- - A null value is not the same as a NaN value. - To drop NaN values, use :func:`drop_nans`. - - Examples - -------- - >>> s = pl.Series([1.0, None, 3.0, float("nan")]) - >>> s.drop_nulls() - shape: (3,) - Series: \'\' [f64] - [ - 1.0 - 3.0 - NaN - ] - - ''' - def drop_nans(self) -> Series: - ''' - Drop all floating point NaN values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nulls - - Notes - ----- - A NaN value is not the same as a null value. - To drop null values, use :func:`drop_nulls`. 
- - Examples - -------- - >>> s = pl.Series([1.0, None, 3.0, float("nan")]) - >>> s.drop_nans() - shape: (3,) - Series: \'\' [f64] - [ - 1.0 - null - 3.0 - ] - - ''' - def to_frame(self, name: str | None = ...) -> DataFrame: - ''' - Cast this Series to a DataFrame. - - Parameters - ---------- - name - optionally name/rename the Series column in the new DataFrame. - - Examples - -------- - >>> s = pl.Series("a", [123, 456]) - >>> df = s.to_frame() - >>> df - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 123 │ - │ 456 │ - └─────┘ - - >>> df = s.to_frame("xyz") - >>> df - shape: (2, 1) - ┌─────┐ - │ xyz │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 123 │ - │ 456 │ - └─────┘ - - ''' - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: - ''' - Quick summary statistics of a Series. - - Series with mixed datatypes will return summary statistics for the datatype of - the first value. - - Parameters - ---------- - percentiles - One or more percentiles to include in the summary statistics (if the - Series has a numeric dtype). All values must be in the range `[0, 1]`. - - Notes - ----- - The median is included by default as the 50% percentile. - - Returns - ------- - DataFrame - Mapping with summary statistics of a Series. - - Examples - -------- - >>> series_num = pl.Series([1, 2, 3, 4, 5]) - >>> series_num.describe() - shape: (9, 2) - ┌────────────┬──────────┐ - │ statistic ┆ value │ - │ --- ┆ --- │ - │ str ┆ f64 │ - ╞════════════╪══════════╡ - │ count ┆ 5.0 │ - │ null_count ┆ 0.0 │ - │ mean ┆ 3.0 │ - │ std ┆ 1.581139 │ - │ min ┆ 1.0 │ - │ 25% ┆ 2.0 │ - │ 50% ┆ 3.0 │ - │ 75% ┆ 4.0 │ - │ max ┆ 5.0 │ - └────────────┴──────────┘ - - >>> series_str = pl.Series(["a", "a", None, "b", "c"]) - >>> series_str.describe() - shape: (3, 2) - ┌────────────┬───────┐ - │ statistic ┆ value │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════════╪═══════╡ - │ count ┆ 5 │ - │ null_count ┆ 1 │ - │ unique ┆ 4 │ - └────────────┴───────┘ - - ''' - def sum(self) -> int | float: - ''' - Reduce this Series to the sum value. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.sum() - 6 - - ''' - def mean(self) -> int | float | None: - ''' - Reduce this Series to the mean value. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.mean() - 2.0 - - ''' - def product(self) -> int | float: - """Reduce this Series to the product value.""" - def pow(self, exponent: int | float | None | Series) -> Series: - ''' - Raise to the power of the given exponent. - - Parameters - ---------- - exponent - The exponent. Accepts Series input. - - Examples - -------- - >>> s = pl.Series("foo", [1, 2, 3, 4]) - >>> s.pow(3) - shape: (4,) - Series: \'foo\' [f64] - [ - 1.0 - 8.0 - 27.0 - 64.0 - ] - - ''' - def min(self) -> PythonLiteral | None: - ''' - Get the minimal value in this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.min() - 1 - - ''' - def max(self) -> PythonLiteral | None: - ''' - Get the maximum value in this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.max() - 3 - - ''' - def nan_max(self) -> int | float | date | datetime | timedelta | str: - """ - Get maximum value, but propagate/poison encountered NaN values. - - This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. 
- - """ - def nan_min(self) -> int | float | date | datetime | timedelta | str: - """ - Get minimum value, but propagate/poison encountered NaN values. - - This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - """ - def std(self, ddof: int = ...) -> float | None: - ''' - Get the standard deviation of this Series. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.std() - 1.0 - - ''' - def var(self, ddof: int = ...) -> float | None: - ''' - Get variance of this Series. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.var() - 1.0 - - ''' - def median(self) -> float | None: - ''' - Get the median of this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.median() - 2.0 - - ''' - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: - ''' - Get the quantile value of this Series. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.quantile(0.5) - 2.0 - - ''' - def to_dummies(self, separator: str = ...) -> DataFrame: - ''' - Get dummy/indicator variables. - - Parameters - ---------- - separator - Separator/delimiter used when generating column names. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.to_dummies() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a_1 ┆ a_2 ┆ a_3 │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 0 ┆ 0 │ - │ 0 ┆ 1 ┆ 0 │ - │ 0 ┆ 0 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def cut(self, breaks: Sequence[float]) -> Series | DataFrame: - ''' - Bin continuous values into discrete categories. - - Parameters - ---------- - breaks - List of unique cut points. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - break_point_label - Name of the breakpoint column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - category_label - Name of the category column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - left_closed - Set the intervals to be left-closed instead of right-closed. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - as_series - If set to `False`, return a DataFrame containing the original values, - the breakpoints, and the categories. - - .. deprecated:: 0.19.0 - This parameter will be removed. The same behavior can be achieved by - setting `include_breaks=True`, unnesting the resulting struct Series, - and adding the result to the original Series. 
- - Returns - ------- - Series - Series of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise a Series of data type :class:`Struct`. - - See Also - -------- - qcut - - Examples - -------- - Divide the column into three categories. - - >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) - >>> s.cut([-1, 1], labels=["a", "b", "c"]) - shape: (5,) - Series: \'foo\' [cat] - [ - "a" - "a" - "b" - "b" - "c" - ] - - Create a DataFrame with the breakpoint and category for each value. - - >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") - >>> s.to_frame().with_columns(cut).unnest("cut") - shape: (5, 3) - ┌─────┬─────────────┬────────────┐ - │ foo ┆ break_point ┆ category │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪═════════════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴─────────────┴────────────┘ - - ''' - def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: - ''' - Bin continuous values into discrete categories based on their quantiles. - - Parameters - ---------- - quantiles - Either a list of quantile probabilities between 0 and 1 or a positive - integer determining the number of bins with uniform probability. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - left_closed - Set the intervals to be left-closed instead of right-closed. - allow_duplicates - If set to `True`, duplicates in the resulting quantiles are dropped, - rather than raising a `DuplicateError`. This can happen even with unique - probabilities, depending on the data. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - break_point_label - Name of the breakpoint column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - category_label - Name of the category column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - as_series - If set to `False`, return a DataFrame containing the original values, - the breakpoints, and the categories. - - .. deprecated:: 0.19.0 - This parameter will be removed. The same behavior can be achieved by - setting `include_breaks=True`, unnesting the resulting struct Series, - and adding the result to the original Series. - - Returns - ------- - Series - Series of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise a Series of data type :class:`Struct`. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - See Also - -------- - cut - - Examples - -------- - Divide a column into three categories according to pre-defined quantile - probabilities. - - >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) - >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) - shape: (5,) - Series: \'foo\' [cat] - [ - "a" - "a" - "b" - "b" - "c" - ] - - Divide a column into two categories using uniform quantile probabilities. 
- - >>> s.qcut(2, labels=["low", "high"], left_closed=True) - shape: (5,) - Series: \'foo\' [cat] - [ - "low" - "low" - "high" - "high" - "high" - ] - - Create a DataFrame with the breakpoint and category for each value. - - >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") - >>> s.to_frame().with_columns(cut).unnest("cut") - shape: (5, 3) - ┌─────┬─────────────┬────────────┐ - │ foo ┆ break_point ┆ category │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪═════════════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴─────────────┴────────────┘ - - ''' - def rle(self) -> Series: - ''' - Get the lengths of runs of identical values. - - Returns - ------- - Series - Series of data type :class:`Struct` with Fields "lengths" and "values". - - Examples - -------- - >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) - >>> s.rle().struct.unnest() - shape: (6, 2) - ┌─────────┬────────┐ - │ lengths ┆ values │ - │ --- ┆ --- │ - │ i32 ┆ i64 │ - ╞═════════╪════════╡ - │ 2 ┆ 1 │ - │ 1 ┆ 2 │ - │ 1 ┆ 1 │ - │ 1 ┆ null │ - │ 1 ┆ 1 │ - │ 2 ┆ 3 │ - └─────────┴────────┘ - ''' - def rle_id(self) -> Series: - ''' - Map values to run IDs. - - Similar to RLE, but it maps each value to an ID corresponding to the run into - which it falls. This is especially useful when you want to define groups by - runs of identical values rather than the values themselves. - - Returns - ------- - Series - - See Also - -------- - rle - - Examples - -------- - >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) - >>> s.rle_id() - shape: (8,) - Series: \'s\' [u32] - [ - 0 - 0 - 1 - 2 - 3 - 4 - 5 - 5 - ] - ''' - def hist(self, bins: list[float] | None = ...) -> DataFrame: - ''' - Bin values into buckets and count their occurrences. - - Parameters - ---------- - bins - Discretizations to make. - If None given, we determine the boundaries based on the data. - bin_count - If no bins provided, this will be used to determine - the distance of the bins - - Returns - ------- - DataFrame - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Examples - -------- - >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) - >>> a.hist(bin_count=4) - shape: (5, 3) - ┌─────────────┬─────────────┬─────────┐ - │ break_point ┆ category ┆ a_count │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ cat ┆ u32 │ - ╞═════════════╪═════════════╪═════════╡ - │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ - │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ - │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ - │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ - │ inf ┆ (6.75, inf] ┆ 2 │ - └─────────────┴─────────────┴─────────┘ - - ''' - def value_counts(self) -> DataFrame: - ''' - Count the occurrences of unique values. - - Parameters - ---------- - sort - Sort the output by count in descending order. - If set to `False` (default), the order of the output is random. - parallel - Execute the computation in parallel. - - .. note:: - This option should likely not be enabled in a group by context, - as the computation is already parallelized per group. - - Returns - ------- - DataFrame - Mapping of unique values to their count. - - Examples - -------- - >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) - >>> s.value_counts() # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌───────┬────────┐ - │ color ┆ counts │ - │ --- ┆ --- │ - │ str ┆ u32 │ - ╞═══════╪════════╡ - │ red ┆ 2 │ - │ green ┆ 1 │ - │ blue ┆ 3 │ - └───────┴────────┘ - - Sort the output by count. 
- - shape: (3, 2) - ┌───────┬────────┐ - │ color ┆ counts │ - │ --- ┆ --- │ - │ str ┆ u32 │ - ╞═══════╪════════╡ - │ blue ┆ 3 │ - │ red ┆ 2 │ - │ green ┆ 1 │ - └───────┴────────┘ - - ''' - def unique_counts(self) -> Series: - ''' - Return a count of the unique values in the order of appearance. - - Examples - -------- - >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) - >>> s.unique_counts() - shape: (3,) - Series: \'id\' [u32] - [ - 1 - 2 - 3 - ] - - ''' - def entropy(self, base: float = ...) -> float | None: - """ - Computes the entropy. - - Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. - - Parameters - ---------- - base - Given base, defaults to `e` - normalize - Normalize pk if it doesn't sum to 1. - - Examples - -------- - >>> a = pl.Series([0.99, 0.005, 0.005]) - >>> a.entropy(normalize=True) - 0.06293300616044681 - >>> b = pl.Series([0.65, 0.10, 0.25]) - >>> b.entropy(normalize=True) - 0.8568409950394724 - - """ - def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: - ''' - Run an expression over a sliding window that increases `1` slot every iteration. - - Parameters - ---------- - expr - Expression to evaluate - min_periods - Number of valid values there should be in the window before the expression - is evaluated. valid values = `length - null_count` - parallel - Run in parallel. Don\'t do this in a group by or another operation that - already has much parallelization. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - This can be really slow as it can have `O(n^2)` complexity. Don\'t use this - for operations that visit all elements. - - Examples - -------- - >>> s = pl.Series("values", [1, 2, 3, 4, 5]) - >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) - shape: (5,) - Series: \'values\' [f64] - [ - 0.0 - -3.0 - -8.0 - -15.0 - -24.0 - ] - - ''' - def alias(self, name: str) -> Series: - ''' - Rename the series. - - Parameters - ---------- - name - The new name. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.alias("b") - shape: (3,) - Series: \'b\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def rename(self, name: str) -> Series: - ''' - Rename this Series. - - Alias for :func:`Series.alias`. - - Parameters - ---------- - name - New name. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.rename("b") - shape: (3,) - Series: \'b\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def chunk_lengths(self) -> list[int]: - ''' - Get the length of each individual chunk. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("a", [4, 5, 6]) - - Concatenate Series with rechunk = True - - >>> pl.concat([s, s2]).chunk_lengths() - [6] - - Concatenate Series with rechunk = False - - >>> pl.concat([s, s2], rechunk=False).chunk_lengths() - [3, 3] - - ''' - def n_chunks(self) -> int: - ''' - Get the number of chunks that this Series contains. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.n_chunks() - 1 - >>> s2 = pl.Series("a", [4, 5, 6]) - - Concatenate Series with rechunk = True - - >>> pl.concat([s, s2]).n_chunks() - 1 - - Concatenate Series with rechunk = False - - >>> pl.concat([s, s2], rechunk=False).n_chunks() - 2 - - ''' - def cum_max(self) -> Series: - ''' - Get an array with the cumulative max computed at every element. - - Parameters - ---------- - reverse - reverse the operation. 
- - Examples - -------- - >>> s = pl.Series("s", [3, 5, 1]) - >>> s.cum_max() - shape: (3,) - Series: \'s\' [i64] - [ - 3 - 5 - 5 - ] - - ''' - def cum_min(self) -> Series: - ''' - Get an array with the cumulative min computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Examples - -------- - >>> s = pl.Series("s", [1, 2, 3]) - >>> s.cum_min() - shape: (3,) - Series: \'s\' [i64] - [ - 1 - 1 - 1 - ] - - ''' - def cum_prod(self) -> Series: - ''' - Get an array with the cumulative product computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.cum_prod() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 6 - ] - - ''' - def cum_sum(self) -> Series: - ''' - Get an array with the cumulative sum computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.cum_sum() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 3 - 6 - ] - - ''' - def slice(self, offset: int, length: int | None = ...) -> Series: - ''' - Get a slice of this Series. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.slice(1, 2) - shape: (2,) - Series: \'a\' [i64] - [ - 2 - 3 - ] - - ''' - def append(self, other: Series) -> Self: - ''' - Append a Series to this one. - - Parameters - ---------- - other - Series to append. - append_chunks - .. deprecated:: 0.18.8 - This argument will be removed and `append` will change to always - behave like `append_chunks=True` (the previous default). For the - behavior of `append_chunks=False`, use `Series.extend`. - - If set to `True` the append operation will add the chunks from `other` to - self. This is super cheap. - - If set to `False` the append operation will do the same as - `DataFrame.extend` which extends the memory backed by this `Series` with - the values from `other`. - - Different from `append chunks`, `extend` appends the data from `other` to - the underlying memory locations and thus may cause a reallocation (which are - expensive). - - If this does not cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `append_chunks` when you want to do a query after a - single append. For instance during online operations where you add `n` rows - and rerun a query. - - Prefer `append_chunks` over `extend` when you want to append many times - before doing a query. For instance when you read in multiple files and when - to store them in a single `Series`. In the latter case, finish the sequence - of `append_chunks` operations with a `rechunk`. - - Warnings - -------- - This method modifies the series in-place. The series is returned for - convenience only. - - See Also - -------- - extend - - Examples - -------- - >>> a = pl.Series("a", [1, 2, 3]) - >>> b = pl.Series("b", [4, 5]) - >>> a.append(b) - shape: (5,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ] - - The resulting series will consist of multiple chunks. 
- - >>> a.n_chunks() - 2 - - ''' - def extend(self, other: Series) -> Self: - ''' - Extend the memory backed by this Series with the values from another. - - Different from `append`, which adds the chunks from `other` to the chunks of - this series, `extend` appends the data from `other` to the underlying memory - locations and thus may cause a reallocation (which is expensive). - - If this does `not` cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `append` when you want to do a query after a single - append. For instance, during online operations where you add `n` rows - and rerun a query. - - Prefer `append` over `extend` when you want to append many times - before doing a query. For instance, when you read in multiple files and want - to store them in a single `Series`. In the latter case, finish the sequence - of `append` operations with a `rechunk`. - - Parameters - ---------- - other - Series to extend the series with. - - Warnings - -------- - This method modifies the series in-place. The series is returned for - convenience only. - - See Also - -------- - append - - Examples - -------- - >>> a = pl.Series("a", [1, 2, 3]) - >>> b = pl.Series("b", [4, 5]) - >>> a.extend(b) - shape: (5,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ] - - The resulting series will consist of a single chunk. - - >>> a.n_chunks() - 1 - - ''' - def filter(self, predicate: Series | list[bool]) -> Self: - ''' - Filter elements by a boolean mask. - - The original order of the remaining elements is preserved. - - Parameters - ---------- - predicate - Boolean mask. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> mask = pl.Series("", [True, False, True]) - >>> s.filter(mask) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 3 - ] - - ''' - def head(self, n: int = ...) -> Series: - ''' - Get the first `n` elements. - - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the last `abs(n)`. - - See Also - -------- - tail, slice - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.head(3) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - Pass a negative value to get all rows `except` the last `abs(n)`. - - >>> s.head(-3) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 2 - ] - - ''' - def tail(self, n: int = ...) -> Series: - ''' - Get the last `n` elements. - - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the first `abs(n)`. - - See Also - -------- - head, slice - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.tail(3) - shape: (3,) - Series: \'a\' [i64] - [ - 3 - 4 - 5 - ] - - Pass a negative value to get all rows `except` the first `abs(n)`. - - >>> s.tail(-3) - shape: (2,) - Series: \'a\' [i64] - [ - 4 - 5 - ] - - ''' - def limit(self, n: int = ...) -> Series: - """ - Get the first `n` elements. - - Alias for :func:`Series.head`. - - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the last `abs(n)`. - - See Also - -------- - head - - """ - def gather_every(self, n: int) -> Series: - ''' - Take every nth value in the Series and return as new Series. - - Parameters - ---------- - n - Gather every *n*-th row. 
- - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.gather_every(2) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 3 - ] - - ''' - def sort(self) -> Self: - ''' - Sort this Series. - - Parameters - ---------- - descending - Sort in descending order. - in_place - Sort in-place. - - Examples - -------- - >>> s = pl.Series("a", [1, 3, 4, 2]) - >>> s.sort() - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - ] - >>> s.sort(descending=True) - shape: (4,) - Series: \'a\' [i64] - [ - 4 - 3 - 2 - 1 - ] - - ''' - def top_k(self, k: int | IntoExprColumn = ...) -> Series: - ''' - Return the `k` largest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - bottom_k - - Examples - -------- - >>> s = pl.Series("a", [2, 5, 1, 4, 3]) - >>> s.top_k(3) - shape: (3,) - Series: \'a\' [i64] - [ - 5 - 4 - 3 - ] - - ''' - def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: - ''' - Return the `k` smallest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - top_k - - Examples - -------- - >>> s = pl.Series("a", [2, 5, 1, 4, 3]) - >>> s.bottom_k(3) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def arg_sort(self) -> Series: - ''' - Get the index values that would sort this Series. - - Parameters - ---------- - descending - Sort in descending order. - nulls_last - Place null values last instead of first. - - Examples - -------- - >>> s = pl.Series("a", [5, 3, 4, 1, 2]) - >>> s.arg_sort() - shape: (5,) - Series: \'a\' [u32] - [ - 3 - 4 - 1 - 2 - 0 - ] - - ''' - def arg_unique(self) -> Series: - ''' - Get unique index as Series. - - Returns - ------- - Series - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.arg_unique() - shape: (3,) - Series: \'a\' [u32] - [ - 0 - 1 - 3 - ] - - ''' - def arg_min(self) -> int | None: - ''' - Get the index of the minimal value. - - Returns - ------- - int - - Examples - -------- - >>> s = pl.Series("a", [3, 2, 1]) - >>> s.arg_min() - 2 - - ''' - def arg_max(self) -> int | None: - ''' - Get the index of the maximal value. - - Returns - ------- - int - - Examples - -------- - >>> s = pl.Series("a", [3, 2, 1]) - >>> s.arg_max() - 0 - - ''' - def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: - """ - Find indices where elements should be inserted to maintain order. - - .. math:: a[i-1] < v <= a[i] - - Parameters - ---------- - element - Expression or scalar value. - side : {'any', 'left', 'right'} - If 'any', the index of the first suitable location found is given. - If 'left', the index of the leftmost suitable location found is given. - If 'right', return the rightmost suitable location found is given. - - """ - def unique(self) -> Series: - ''' - Get unique elements in series. - - Parameters - ---------- - maintain_order - Maintain order of data. This requires more work. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.unique().sort() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: - ''' - Take values by index. - - Parameters - ---------- - indices - Index location used for selection. 
- - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.gather([1, 3]) - shape: (2,) - Series: \'a\' [i64] - [ - 2 - 4 - ] - - ''' - def null_count(self) -> int: - """Count the null values in this Series.""" - def has_validity(self) -> bool: - """ - Return True if the Series has a validity bitmask. - - If there is no mask, it means that there are no `null` values. - - Notes - ----- - While the *absence* of a validity bitmask guarantees that a Series does not - have `null` values, the converse is not true, eg: the *presence* of a - bitmask does not mean that there are null values, as every value of the - bitmask could be `false`. - - To confirm that a column has `null` values use :func:`null_count`. - - """ - def is_empty(self) -> bool: - ''' - Check if the Series is empty. - - Examples - -------- - >>> s = pl.Series("a", [], dtype=pl.Float32) - >>> s.is_empty() - True - - ''' - def is_sorted(self) -> bool: - """ - Check if the Series is sorted. - - Parameters - ---------- - descending - Check if the Series is sorted in descending order - - """ - def not_(self) -> Series: - ''' - Negate a boolean Series. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [True, False, False]) - >>> s.not_() - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - ''' - def is_null(self) -> Series: - ''' - Returns a boolean Series indicating which values are null. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) - >>> s.is_null() - shape: (4,) - Series: \'a\' [bool] - [ - false - false - false - true - ] - - ''' - def is_not_null(self) -> Series: - ''' - Returns a boolean Series indicating which values are not null. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) - >>> s.is_not_null() - shape: (4,) - Series: \'a\' [bool] - [ - true - true - true - false - ] - - ''' - def is_finite(self) -> Series: - ''' - Returns a boolean Series indicating which values are finite. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, np.inf]) - >>> s.is_finite() - shape: (3,) - Series: \'a\' [bool] - [ - true - true - false - ] - - ''' - def is_infinite(self) -> Series: - ''' - Returns a boolean Series indicating which values are infinite. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, np.inf]) - >>> s.is_infinite() - shape: (3,) - Series: \'a\' [bool] - [ - false - false - true - ] - - ''' - def is_nan(self) -> Series: - ''' - Returns a boolean Series indicating which values are not NaN. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) - >>> s.is_nan() - shape: (4,) - Series: \'a\' [bool] - [ - false - false - false - true - ] - - ''' - def is_not_nan(self) -> Series: - ''' - Returns a boolean Series indicating which values are not NaN. - - Returns - ------- - Series - Series of data type :class:`Boolean`. 
- - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) - >>> s.is_not_nan() - shape: (4,) - Series: \'a\' [bool] - [ - true - true - true - false - ] - - ''' - def is_in(self, other: Series | Collection[Any]) -> Series: - ''' - Check if elements of this Series are in the other Series. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [2, 4]) - >>> s2.is_in(s) - shape: (2,) - Series: \'b\' [bool] - [ - true - false - ] - - >>> # check if some values are a member of sublists - >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) - >>> optional_members = pl.Series("optional_members", [1, 2, 3]) - >>> print(sets) - shape: (3,) - Series: \'sets\' [list[i64]] - [ - [1, 2, 3] - [1, 2] - [9, 10] - ] - >>> print(optional_members) - shape: (3,) - Series: \'optional_members\' [i64] - [ - 1 - 2 - 3 - ] - >>> optional_members.is_in(sets) - shape: (3,) - Series: \'optional_members\' [bool] - [ - true - true - false - ] - - ''' - def arg_true(self) -> Series: - ''' - Get index values where Boolean Series evaluate True. - - Returns - ------- - Series - Series of data type :class:`UInt32`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> (s == 2).arg_true() - shape: (1,) - Series: \'a\' [u32] - [ - 1 - ] - - ''' - def is_unique(self) -> Series: - ''' - Get mask of all unique values. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.is_unique() - shape: (4,) - Series: \'a\' [bool] - [ - true - false - false - true - ] - - ''' - def is_first_distinct(self) -> Series: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series([1, 1, 2, 3, 2]) - >>> s.is_first_distinct() - shape: (5,) - Series: '' [bool] - [ - true - false - true - true - false - ] - - """ - def is_last_distinct(self) -> Series: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series([1, 1, 2, 3, 2]) - >>> s.is_last_distinct() - shape: (5,) - Series: '' [bool] - [ - false - true - false - true - true - ] - - """ - def is_duplicated(self) -> Series: - ''' - Get mask of all duplicated values. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.is_duplicated() - shape: (4,) - Series: \'a\' [bool] - [ - false - true - true - false - ] - - ''' - def explode(self) -> Series: - """ - Explode a list Series. - - This means that every item is expanded to a new row. - - Returns - ------- - Series - Series with the data type of the list elements. - - See Also - -------- - Series.list.explode : Explode a list column. - Series.str.explode : Explode a string column. - - """ - def equals(self, other: Series) -> bool: - ''' - Check whether the Series is equal to another Series. - - Parameters - ---------- - other - Series to compare with. - null_equal - Consider null values as equal. - strict - Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a - `pl.Int64` will return `False`. 
- - See Also - -------- - assert_series_equal - - Examples - -------- - >>> s1 = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [4, 5, 6]) - >>> s1.equals(s1) - True - >>> s1.equals(s2) - False - ''' - def len(self) -> int: - ''' - Return the number of elements in this Series. - - Null values are treated like regular elements in this context. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None]) - >>> s.len() - 3 - - ''' - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: - ''' - Cast between data types. - - Parameters - ---------- - dtype - DataType to cast to. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> s = pl.Series("a", [True, False, True]) - >>> s - shape: (3,) - Series: \'a\' [bool] - [ - true - false - true - ] - - >>> s.cast(pl.UInt32) - shape: (3,) - Series: \'a\' [u32] - [ - 1 - 0 - 1 - ] - - ''' - def to_physical(self) -> Series: - ''' - Cast to physical representation of the logical dtype. - - - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` - - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` - - `List(inner)` -> `List(physical of inner)` - - Other data types will be left unchanged. - - Examples - -------- - Replicating the pandas - `pd.Series.factorize - `_ - method. - - >>> s = pl.Series("values", ["a", None, "x", "a"]) - >>> s.cast(pl.Categorical).to_physical() - shape: (4,) - Series: \'values\' [u32] - [ - 0 - null - 1 - 0 - ] - - ''' - def to_list(self) -> list[Any]: - ''' - Convert this Series to a Python List. This operation clones data. - - Parameters - ---------- - use_pyarrow - Use pyarrow for the conversion. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.to_list() - [1, 2, 3] - >>> type(s.to_list()) - - - ''' - def rechunk(self) -> Self: - """ - Create a single chunk of memory for this Series. - - Parameters - ---------- - in_place - In place or not. - - """ - def reverse(self) -> Series: - ''' - Return Series in reverse order. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) - >>> s.reverse() - shape: (3,) - Series: \'a\' [i8] - [ - 3 - 2 - 1 - ] - - ''' - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: - ''' - Get a boolean mask of the values that fall between the given start/end values. - - Parameters - ---------- - lower_bound - Lower bound value. Accepts expression input. Non-expression inputs - (including strings) are parsed as literals. - upper_bound - Upper bound value. Accepts expression input. Non-expression inputs - (including strings) are parsed as literals. - closed : {\'both\', \'left\', \'right\', \'none\'} - Define which sides of the interval are closed (inclusive). 
- - Examples - -------- - >>> s = pl.Series("num", [1, 2, 3, 4, 5]) - >>> s.is_between(2, 4) - shape: (5,) - Series: \'num\' [bool] - [ - false - true - true - true - false - ] - - Use the `closed` argument to include or exclude the values at the bounds: - - >>> s.is_between(2, 4, closed="left") - shape: (5,) - Series: \'num\' [bool] - [ - false - true - true - false - false - ] - - You can also use strings as well as numeric/temporal values: - - >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) - >>> s.is_between("b", "d", closed="both") - shape: (5,) - Series: \'s\' [bool] - [ - false - true - true - true - false - ] - - ''' - def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: - ''' - Convert this Series to numpy. - - This operation may clone data but is completely safe. Note that: - - - data which is purely numeric AND without null values is not cloned; - - floating point `nan` values can be zero-copied; - - booleans can\'t be zero-copied. - - To ensure that no data is cloned, set `zero_copy_only=True`. - - Parameters - ---------- - *args - args will be sent to pyarrow.Array.to_numpy. - zero_copy_only - If True, an exception will be raised if the conversion to a numpy - array would require copying the underlying data (e.g. in presence - of nulls, or for non-primitive types). - writable - For numpy arrays created with zero copy (view on the Arrow data), - the resulting array is not writable (Arrow data is immutable). - By setting this to True, a copy of the array is made to ensure - it is writable. - use_pyarrow - Use `pyarrow.Array.to_numpy - `_ - - for the conversion to numpy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> arr = s.to_numpy() - >>> arr # doctest: +IGNORE_RESULT - array([1, 2, 3], dtype=int64) - >>> type(arr) - - - ''' - def _view(self) -> SeriesView: - ''' - Get a view into this Series data with a numpy array. - - This operation doesn\'t clone data, but does not include missing values. - - Returns - ------- - SeriesView - - Parameters - ---------- - ignore_nulls - If True then nulls are converted to 0. - If False then an Exception is raised if nulls are present. - - Examples - -------- - >>> s = pl.Series("a", [1, None]) - >>> s._view(ignore_nulls=True) - SeriesView([1, 0]) - - ''' - def to_arrow(self) -> pa.Array: - ''' - Get the underlying Arrow Array. - - If the Series contains only a single chunk this operation is zero copy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s = s.to_arrow() - >>> s # doctest: +ELLIPSIS - - [ - 1, - 2, - 3 - ] - - ''' - def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: - ''' - Convert this Series to a pandas Series. - - This requires that :mod:`pandas` and :mod:`pyarrow` are installed. - This operation clones data, unless `use_pyarrow_extension_array=True`. - - Parameters - ---------- - use_pyarrow_extension_array - Further operations on this Pandas series, might trigger conversion to numpy. - Use PyArrow backed-extension array instead of numpy array for pandas - Series. This allows zero copy operations and preservation of nulls - values. - Further operations on this pandas Series, might trigger conversion - to NumPy arrays if that operation is not supported by pyarrow compute - functions. - kwargs - Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
- - Examples - -------- - >>> s1 = pl.Series("a", [1, 2, 3]) - >>> s1.to_pandas() - 0 1 - 1 2 - 2 3 - Name: a, dtype: int64 - >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP - 0 1 - 1 2 - 2 3 - Name: a, dtype: int64[pyarrow] - >>> s2 = pl.Series("b", [1, 2, None, 4]) - >>> s2.to_pandas() - 0 1.0 - 1 2.0 - 2 NaN - 3 4.0 - Name: b, dtype: float64 - >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP - 0 1 - 1 2 - 2 - 3 4 - Name: b, dtype: int64[pyarrow] - - ''' - def to_init_repr(self, n: int = ...) -> str: - ''' - Convert Series to instantiatable string representation. - - Parameters - ---------- - n - Only use first n elements. - - See Also - -------- - polars.Series.to_init_repr - polars.from_repr - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) - >>> print(s.to_init_repr()) - pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) - >>> s_from_str_repr = eval(s.to_init_repr()) - >>> s_from_str_repr - shape: (4,) - Series: \'a\' [i16] - [ - 1 - 2 - null - 4 - ] - - ''' - def set(self, filter: Series, value: int | float | str | bool | None) -> Series: - ''' - Set masked values. - - Parameters - ---------- - filter - Boolean mask. - value - Value with which to replace the masked values. - - Notes - ----- - Use of this function is frequently an anti-pattern, as it can - block optimisation (predicate pushdown, etc). Consider using - `pl.when(predicate).then(value).otherwise(self)` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.set(s == 2, 10) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 10 - 3 - ] - - It is better to implement this as follows: - - >>> s.to_frame().select( - ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) - ... ) - shape: (3, 1) - ┌─────────┐ - │ literal │ - │ --- │ - │ i64 │ - ╞═════════╡ - │ 1 │ - │ 10 │ - │ 3 │ - └─────────┘ - - ''' - def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: - ''' - Set values at the index locations. - - Parameters - ---------- - indices - Integers representing the index locations. - values - Replacement values. - - Notes - ----- - Use of this function is frequently an anti-pattern, as it can - block optimization (predicate pushdown, etc). Consider using - `pl.when(predicate).then(value).otherwise(self)` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.scatter(1, 10) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 10 - 3 - ] - - It is better to implement this as follows: - - >>> s.to_frame().with_row_count("row_nr").select( - ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) - ... ) - shape: (3, 1) - ┌─────────┐ - │ literal │ - │ --- │ - │ i64 │ - ╞═════════╡ - │ 1 │ - │ 10 │ - │ 3 │ - └─────────┘ - - ''' - def clear(self, n: int = ...) -> Series: - ''' - Create an empty copy of the current Series, with zero to \'n\' elements. - - The copy has an identical name/dtype, but no data. - - Parameters - ---------- - n - Number of (empty) elements to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. 
- - Examples - -------- - >>> s = pl.Series("a", [None, True, False]) - >>> s.clear() - shape: (0,) - Series: \'a\' [bool] - [ - ] - - >>> s.clear(n=2) - shape: (2,) - Series: \'a\' [bool] - [ - null - null - ] - - ''' - def clone(self) -> Self: - ''' - Create a copy of this Series. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current Series, with identical - schema but no data. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.clone() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Series: - ''' - Fill floating point NaN value with a fill value. - - Parameters - ---------- - value - Value used to fill NaN values. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, float("nan")]) - >>> s.fill_nan(0) - shape: (4,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - 0.0 - ] - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, None]) - >>> s.fill_null(strategy="forward") - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 3 - ] - >>> s.fill_null(strategy="min") - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 1 - ] - >>> s = pl.Series("b", ["x", None, "z"]) - >>> s.fill_null(pl.lit("")) - shape: (3,) - Series: \'b\' [str] - [ - "x" - "" - "z" - ] - - ''' - def floor(self) -> Series: - ''' - Rounds down to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.floor() - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - ] - - ''' - def ceil(self) -> Series: - ''' - Rounds up to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.ceil() - shape: (3,) - Series: \'a\' [f64] - [ - 2.0 - 3.0 - 4.0 - ] - - ''' - def round(self, decimals: int = ...) -> Series: - ''' - Round underlying floating point data by `decimals` digits. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.round(2) - shape: (3,) - Series: \'a\' [f64] - [ - 1.12 - 2.57 - 3.9 - ] - - Parameters - ---------- - decimals - number of decimals to round by. - - ''' - def round_sig_figs(self, digits: int) -> Series: - """ - Round to a number of significant figures. - - Parameters - ---------- - digits - Number of significant figures to round to. - - Examples - -------- - >>> s = pl.Series([0.01234, 3.333, 1234.0]) - >>> s.round_sig_figs(2) - shape: (3,) - Series: '' [f64] - [ - 0.012 - 3.3 - 1200.0 - ] - - """ - def dot(self, other: Series | ArrayLike) -> float | None: - ''' - Compute the dot/inner product between two Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) - >>> s.dot(s2) - 32.0 - - Parameters - ---------- - other - Series (or array) to compute dot product with. - - ''' - def mode(self) -> Series: - ''' - Compute the most occurring value(s). 
- - Can return multiple Values. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.mode() - shape: (1,) - Series: \'a\' [i64] - [ - 2 - ] - - ''' - def sign(self) -> Series: - ''' - Compute the element-wise indication of the sign. - - The returned values can be -1, 0, or 1: - - * -1 if x < 0. - * 0 if x == 0. - * 1 if x > 0. - - (null values are preserved as-is). - - Examples - -------- - >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) - >>> s.sign() - shape: (5,) - Series: \'a\' [i64] - [ - -1 - 0 - 0 - 1 - null - ] - - ''' - def sin(self) -> Series: - ''' - Compute the element-wise value for the sine. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.sin() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.0 - 1.2246e-16 - ] - - ''' - def cos(self) -> Series: - ''' - Compute the element-wise value for the cosine. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.cos() - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 6.1232e-17 - -1.0 - ] - - ''' - def tan(self) -> Series: - ''' - Compute the element-wise value for the tangent. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.tan() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.6331e16 - -1.2246e-16 - ] - - ''' - def cot(self) -> Series: - ''' - Compute the element-wise value for the cotangent. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.cot() - shape: (3,) - Series: \'a\' [f64] - [ - inf - 6.1232e-17 - -8.1656e15 - ] - - ''' - def arcsin(self) -> Series: - ''' - Compute the element-wise value for the inverse sine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arcsin() - shape: (3,) - Series: \'a\' [f64] - [ - 1.570796 - 0.0 - -1.570796 - ] - - ''' - def arccos(self) -> Series: - ''' - Compute the element-wise value for the inverse cosine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arccos() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.570796 - 3.141593 - ] - - ''' - def arctan(self) -> Series: - ''' - Compute the element-wise value for the inverse tangent. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arctan() - shape: (3,) - Series: \'a\' [f64] - [ - 0.785398 - 0.0 - -0.785398 - ] - - ''' - def arcsinh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic sine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arcsinh() - shape: (3,) - Series: \'a\' [f64] - [ - 0.881374 - 0.0 - -0.881374 - ] - - ''' - def arccosh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic cosine. - - Examples - -------- - >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) - >>> s.arccosh() - shape: (4,) - Series: \'a\' [f64] - [ - 2.292432 - 0.0 - NaN - NaN - ] - - ''' - def arctanh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic tangent. - - Examples - -------- - >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) - >>> s.arctanh() - shape: (7,) - Series: \'a\' [f64] - [ - NaN - inf - 0.549306 - 0.0 - -0.549306 - -inf - NaN - ] - - ''' - def sinh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic sine. 
- - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.sinh() - shape: (3,) - Series: \'a\' [f64] - [ - 1.175201 - 0.0 - -1.175201 - ] - - ''' - def cosh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic cosine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.cosh() - shape: (3,) - Series: \'a\' [f64] - [ - 1.543081 - 1.0 - 1.543081 - ] - - ''' - def tanh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic tangent. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.tanh() - shape: (3,) - Series: \'a\' [f64] - [ - 0.761594 - 0.0 - -0.761594 - ] - - ''' - def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Map a custom/user-defined function (UDF) over elements in this Series. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - If the function returns a different datatype, the return_dtype arg should - be set, otherwise the method will fail. - - Implementing logic using a Python function is almost always *significantly* - slower and more memory intensive than implementing the same logic using - the native expression API because: - - - The native expression engine runs in Rust; UDFs run in Python. - - Use of Python UDFs forces the DataFrame to be materialized in memory. - - Polars-native expressions can be parallelised (UDFs typically cannot). - - Polars-native expressions can be logically optimised (UDFs cannot). - - Wherever possible you should strongly prefer the native expression API - to achieve the best performance. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output datatype. If none is given, the same datatype as this Series will be - used. - skip_nulls - Nulls will be skipped and not passed to the python function. - This is faster because python can be skipped and because we call - more specialized functions. - - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - Notes - ----- - If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP - shape: (3,) - Series: \'a\' [i64] - [ - 11 - 12 - 13 - ] - - Returns - ------- - Series - - ''' - def shift(self, n: int = ...) -> Series: - """ - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> s = pl.Series([1, 2, 3, 4]) - >>> s.shift() - shape: (4,) - Series: '' [i64] - [ - null - 1 - 2 - 3 - ] - - Pass a negative value to shift in the opposite direction instead. 
- - >>> s.shift(-2) - shape: (4,) - Series: '' [i64] - [ - 3 - 4 - null - null - ] - - Specify `fill_value` to fill the resulting null values. - - >>> s.shift(-2, fill_value=100) - shape: (4,) - Series: '' [i64] - [ - 3 - 4 - 100 - 100 - ] - - """ - def zip_with(self, mask: Series, other: Series) -> Self: - """ - Take values from self or other based on the given mask. - - Where mask evaluates true, take values from self. Where mask evaluates false, - take values from other. - - Parameters - ---------- - mask - Boolean Series. - other - Series of same type. - - Returns - ------- - Series - - Examples - -------- - >>> s1 = pl.Series([1, 2, 3, 4, 5]) - >>> s2 = pl.Series([5, 4, 3, 2, 1]) - >>> s1.zip_with(s1 < s2, s2) - shape: (5,) - Series: '' [i64] - [ - 1 - 2 - 3 - 2 - 1 - ] - >>> mask = pl.Series([True, False, True, False, True]) - >>> s1.zip_with(mask, s2) - shape: (5,) - Series: '' [i64] - [ - 1 - 4 - 3 - 2 - 5 - ] - - """ - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling min (moving min) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their min. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_min(window_size=3) - shape: (5,) - Series: \'a\' [i64] - [ - null - null - 100 - 200 - 300 - ] - - ''' - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling max (moving max) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their max. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_max(window_size=2) - shape: (5,) - Series: \'a\' [i64] - [ - null - 200 - 300 - 400 - 500 - ] - - ''' - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling mean (moving mean) over the values in this array. - - A window of length `window_size` will traverse the array. 
The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their mean. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_mean(window_size=2) - shape: (5,) - Series: \'a\' [f64] - [ - null - 150.0 - 250.0 - 350.0 - 450.0 - ] - - ''' - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling sum (moving sum) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their sum. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length of the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.rolling_sum(window_size=2) - shape: (5,) - Series: \'a\' [i64] - [ - null - 3 - 5 - 7 - 9 - ] - - ''' - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling std dev. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their std dev. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_std(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 1.0 - 1.527525 - 2.0 - ] - - ''' - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling variance. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. 
The resulting values will be aggregated to their variance. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_var(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 1.0 - 2.333333 - 4.0 - ] - - ''' - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a custom rolling window function. - - .. warning:: - Computing custom functions is extremely slow. Use specialized rolling - functions such as :func:`Series.rolling_sum` if at all possible. - - Parameters - ---------- - function - Custom aggregation function. - window_size - Size of the window. The window at a given row will include the row - itself and the `window_size - 1` elements before it. - weights - A list of weights with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window. - - Warnings - -------- - - - Examples - -------- - >>> from numpy import nansum - >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) - >>> s.rolling_map(nansum, window_size=3) - shape: (5,) - Series: \'\' [f64] - [ - null - null - 22.0 - 11.0 - 17.0 - ] - - ''' - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling median. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_median(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 2.0 - 3.0 - 4.0 - 6.0 - ] - - ''' - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling quantile. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - window_size - The length of the window. 
- weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_quantile(quantile=0.33, window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 2.0 - 3.0 - 4.0 - ] - >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.66 - 2.66 - 3.66 - 5.32 - ] - - ''' - def rolling_skew(self, window_size: int) -> Series: - """ - Compute a rolling skew. - - The window at a given row includes the row itself and the - `window_size - 1` elements before it. - - Parameters - ---------- - window_size - Integer size of the rolling window. - bias - If False, the calculations are corrected for statistical bias. - - Examples - -------- - >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) - shape: (4,) - Series: '' [f64] - [ - null - null - 0.381802 - 0.47033 - ] - - Note how the values match - - >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() - (0.38180177416060584, 0.47033046033698594) - - """ - def sample(self, n: int | None = ...) -> Series: - ''' - Sample from this Series. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - Shuffle the order of sampled data points. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 5 - ] - - ''' - def peak_max(self) -> Self: - ''' - Get a boolean mask of the local maximum peaks. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.peak_max() - shape: (5,) - Series: \'a\' [bool] - [ - false - false - false - false - true - ] - - ''' - def peak_min(self) -> Self: - ''' - Get a boolean mask of the local minimum peaks. - - Examples - -------- - >>> s = pl.Series("a", [4, 1, 3, 2, 5]) - >>> s.peak_min() - shape: (5,) - Series: \'a\' [bool] - [ - false - true - false - true - false - ] - - ''' - def n_unique(self) -> int: - ''' - Count the number of unique values in this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.n_unique() - 3 - - ''' - def shrink_to_fit(self) -> Series: - """ - Shrink Series memory usage. - - Shrinks the underlying array capacity to exactly fit the actual data. - (Note that this function does not change the Series data type). - - """ - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: - ''' - Hash the Series. - - The hash value is of type `UInt64`. - - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. 
- - Notes - ----- - This implementation of :func:`hash` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.hash(seed=42) # doctest: +IGNORE_RESULT - shape: (3,) - Series: \'a\' [u64] - [ - 10734580197236529959 - 3022416320763508302 - 13756996518000038261 - ] - - ''' - def reinterpret(self) -> Series: - """ - Reinterpret the underlying bits as a signed/unsigned integer. - - This operation is only allowed for 64bit integers. For lower bits integers, - you can safely use that cast operation. - - Parameters - ---------- - signed - If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. - - """ - def interpolate(self, method: InterpolationMethod = ...) -> Series: - ''' - Fill null values using interpolation. - - Parameters - ---------- - method : {\'linear\', \'nearest\'} - Interpolation method. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None, None, 5]) - >>> s.interpolate() - shape: (5,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - 4.0 - 5.0 - ] - - ''' - def abs(self) -> Series: - """ - Compute absolute values. - - Same as `abs(series)`. - """ - def rank(self, method: RankMethod = ...) -> Series: - ''' - Assign ranks to data, dealing with ties appropriately. - - Parameters - ---------- - method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} - The method used to assign ranks to tied elements. - The following methods are available (default is \'average\'): - - - \'average\' : The average of the ranks that would have been assigned to - all the tied values is assigned to each value. - - \'min\' : The minimum of the ranks that would have been assigned to all - the tied values is assigned to each value. (This is also referred to - as "competition" ranking.) - - \'max\' : The maximum of the ranks that would have been assigned to all - the tied values is assigned to each value. - - \'dense\' : Like \'min\', but the rank of the next highest element is - assigned the rank immediately after those assigned to the tied - elements. - - \'ordinal\' : All values are given a distinct rank, corresponding to - the order that the values occur in the Series. - - \'random\' : Like \'ordinal\', but the rank for ties is not dependent - on the order that the values occur in the Series. - descending - Rank in descending order. - seed - If `method="random"`, use this as seed. - - Examples - -------- - The \'average\' method: - - >>> s = pl.Series("a", [3, 6, 1, 1, 6]) - >>> s.rank() - shape: (5,) - Series: \'a\' [f64] - [ - 3.0 - 4.5 - 1.5 - 1.5 - 4.5 - ] - - The \'ordinal\' method: - - >>> s = pl.Series("a", [3, 6, 1, 1, 6]) - >>> s.rank("ordinal") - shape: (5,) - Series: \'a\' [u32] - [ - 3 - 4 - 1 - 2 - 5 - ] - - ''' - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: - ''' - Calculate the first discrete difference between shifted items. - - Parameters - ---------- - n - Number of slots to shift. - null_behavior : {\'ignore\', \'drop\'} - How to handle null values. - - Examples - -------- - >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) - >>> s.diff() - shape: (5,) - Series: \'s\' [i8] - [ - null - -10 - 20 - -5 - 10 - ] - - >>> s.diff(n=2) - shape: (5,) - Series: \'s\' [i8] - [ - null - null - 10 - 15 - 5 - ] - - >>> s.diff(n=2, null_behavior="drop") - shape: (3,) - Series: \'s\' [i8] - [ - 10 - 15 - 5 - ] - - ''' - def pct_change(self, n: int | IntoExprColumn = ...) 
-> Series: - """ - Computes percentage change between values. - - Percentage change (as fraction) between current element and most-recent - non-null element at least `n` period(s) before the current element. - - Computes the change from the previous row by default. - - Parameters - ---------- - n - periods to shift for forming percent change. - - Examples - -------- - >>> pl.Series(range(10)).pct_change() - shape: (10,) - Series: '' [f64] - [ - null - inf - 1.0 - 0.5 - 0.333333 - 0.25 - 0.2 - 0.166667 - 0.142857 - 0.125 - ] - - >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) - shape: (10,) - Series: '' [f64] - [ - null - null - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - ] - - """ - def skew(self) -> float | None: - """ - Compute the sample skewness of a data set. - - For normally distributed data, the skewness should be about zero. For - unimodal continuous distributions, a skewness value greater than zero means - that there is more weight in the right tail of the distribution. The - function `skewtest` can be used to determine if the skewness value - is close enough to zero, statistically speaking. - - - See scipy.stats for more information. - - Parameters - ---------- - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Notes - ----- - The sample skewness is computed as the Fisher-Pearson coefficient - of skewness, i.e. - - .. math:: g_1=\\frac{m_3}{m_2^{3/2}} - - where - - .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i - - is the biased sample :math:`i\\texttt{th}` central moment, and - :math:`\\bar{x}` is - the sample mean. If `bias` is False, the calculations are - corrected for bias and the value computed is the adjusted - Fisher-Pearson standardized moment coefficient, i.e. - - .. math:: - G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} - - """ - def kurtosis(self) -> float | None: - """ - Compute the kurtosis (Fisher or Pearson) of a dataset. - - Kurtosis is the fourth central moment divided by the square of the - variance. If Fisher's definition is used, then 3.0 is subtracted from - the result to give 0.0 for a normal distribution. - If bias is False then the kurtosis is calculated using k statistics to - eliminate bias coming from biased moment estimators - - See scipy.stats for more information - - Parameters - ---------- - fisher : bool, optional - If True, Fisher's definition is used (normal ==> 0.0). If False, - Pearson's definition is used (normal ==> 3.0). - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - """ - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: - """ - Set values outside the given boundaries to the boundary value. - - Parameters - ---------- - lower_bound - Lower bound. Accepts expression input. - Non-expression inputs are parsed as literals. - If set to `None` (default), no lower bound is applied. - upper_bound - Upper bound. Accepts expression input. - Non-expression inputs are parsed as literals. - If set to `None` (default), no upper bound is applied. - - See Also - -------- - when - - Notes - ----- - This method only works for numeric and temporal columns. To clip other data - types, consider writing a `when-then-otherwise` expression. See :func:`when`. 
- - Examples - -------- - Specifying both a lower and upper bound: - - >>> s = pl.Series([-50, 5, 50, None]) - >>> s.clip(1, 10) - shape: (4,) - Series: '' [i64] - [ - 1 - 5 - 10 - null - ] - - Specifying only a single bound: - - >>> s.clip(upper_bound=10) - shape: (4,) - Series: '' [i64] - [ - -50 - 5 - 10 - null - ] - - """ - def lower_bound(self) -> Self: - ''' - Return the lower bound of this Series\' dtype as a unit Series. - - See Also - -------- - upper_bound : return the upper bound of the given Series\' dtype. - - Examples - -------- - >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) - >>> s.lower_bound() - shape: (1,) - Series: \'s\' [i32] - [ - -2147483648 - ] - - >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) - >>> s.lower_bound() - shape: (1,) - Series: \'s\' [f32] - [ - -inf - ] - - ''' - def upper_bound(self) -> Self: - ''' - Return the upper bound of this Series\' dtype as a unit Series. - - See Also - -------- - lower_bound : return the lower bound of the given Series\' dtype. - - Examples - -------- - >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) - >>> s.upper_bound() - shape: (1,) - Series: \'s\' [i8] - [ - 127 - ] - - >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) - >>> s.upper_bound() - shape: (1,) - Series: \'s\' [f64] - [ - inf - ] - - ''' - def replace(self, mapping: dict[Any, Any]) -> Self: - ''' - Replace values according to the given mapping. - - Needs a global string cache for lazily evaluated queries on columns of - type `Categorical`. - - Parameters - ---------- - mapping - Mapping of values to their replacement. - default - Value to use when the mapping does not contain the lookup value. - Defaults to keeping the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - - See Also - -------- - str.replace - - Examples - -------- - Replace a single value by another value. Values not in the mapping remain - unchanged. - - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.replace({2: 100}) - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 100 - 100 - 3 - ] - - Replace multiple values. Specify a default to set values not in the given map - to the default value. - - >>> s = pl.Series("country_code", ["FR", "ES", "DE", None]) - >>> country_code_map = { - ... "CA": "Canada", - ... "DE": "Germany", - ... "FR": "France", - ... None: "unspecified", - ... } - >>> s.replace(country_code_map, default=None) - shape: (4,) - Series: \'country_code\' [str] - [ - "France" - null - "Germany" - "unspecified" - ] - - The return type can be overridden with the `return_dtype` argument. - - >>> s = pl.Series("a", [0, 1, 2, 3]) - >>> s.replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) - shape: (4,) - Series: \'a\' [u8] - [ - 0 - 10 - 20 - 0 - ] - ''' - def reshape(self, dimensions: tuple[int, ...]) -> Series: - ''' - Reshape this Series to a flat Series or a Series of Lists. - - Parameters - ---------- - dimensions - Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that - dimension is inferred. - - Returns - ------- - Series - If a single dimension is given, results in a Series of the original - data type. - If a multiple dimensions are given, results in a Series of data type - :class:`List` with shape (rows, cols). - - See Also - -------- - Series.list.explode : Explode a list column. 
- - Examples - -------- - >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) - >>> s.reshape((3, 3)) - shape: (3,) - Series: \'foo\' [list[i64]] - [ - [1, 2, 3] - [4, 5, 6] - [7, 8, 9] - ] - - ''' - def shuffle(self, seed: int | None = ...) -> Series: - ''' - Shuffle the contents of this Series. - - Parameters - ---------- - seed - Seed for the random number generator. If set to None (default), a - random seed is generated each time the shuffle is called. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.shuffle(seed=1) - shape: (3,) - Series: \'a\' [i64] - [ - 2 - 1 - 3 - ] - - ''' - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - """ - Exponentially-weighted moving average. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series([1, 2, 3]) - >>> s.ewm_mean(com=1) - shape: (3,) - Series: '' [f64] - [ - 1.0 - 1.666667 - 2.428571 - ] - - """ - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - ''' - Exponentially-weighted moving standard deviation. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. 
math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.ewm_std(com=1) - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 0.707107 - 0.963624 - ] - - ''' - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - ''' - Exponentially-weighted moving variance. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. 
For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.ewm_var(com=1) - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 0.5 - 0.928571 - ] - - ''' - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: - """ - Extremely fast method for extending the Series with 'n' copies of a value. - - Parameters - ---------- - value - A constant literal value (not an expression) with which to extend - the Series; can pass None to extend with nulls. - n - The number of additional values that will be added. - - Examples - -------- - >>> s = pl.Series([1, 2, 3]) - >>> s.extend_constant(99, n=2) - shape: (5,) - Series: '' [i64] - [ - 1 - 2 - 3 - 99 - 99 - ] - - """ - def set_sorted(self) -> Self: - ''' - Flags the Series as \'sorted\'. - - Enables downstream code to user fast paths for sorted arrays. - - Parameters - ---------- - descending - If the `Series` order is descending. - - Warnings - -------- - This can lead to incorrect results if this `Series` is not sorted!! - Use with care! - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.set_sorted().max() - 3 - - ''' - def new_from_index(self, index: int, length: int) -> Self: - """Create a new Series filled with values from the given index.""" - def shrink_dtype(self) -> Series: - """ - Shrink numeric columns to the minimal required datatype. - - Shrink to the dtype needed to fit the extrema of this [`Series`]. - This can be used to reduce memory pressure. - """ - def get_chunks(self) -> list[Series]: - """Get the chunks of this Series as a list of Series.""" - def implode(self) -> Self: - """Aggregate values into a list.""" - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom/user-defined function (UDF) over elements in this Series. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Series.map_elements`. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output datatype. If none is given, the same datatype as this Series will be - used. - skip_nulls - Nulls will be skipped and not passed to the python function. - This is faster because python can be skipped and because we call - more specialized functions. - - """ - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - """ - Apply a custom rolling window function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Series.rolling_map`. - - Parameters - ---------- - function - Aggregation function - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - """ - def is_first(self) -> Series: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Series.is_first_distinct`. - - Returns - ------- - Series - Series of data type :class:`Boolean`. 
- - """ - def is_last(self) -> Series: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Series.is_last_distinct`. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - """ - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: - """ - Clip (limit) the values in an array to a `min` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - lower_bound - Lower bound. - - """ - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: - """ - Clip (limit) the values in an array to a `max` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - upper_bound - Upper bound. - - """ - def shift_and_fill(self, fill_value: int | Expr) -> Series: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - Fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def is_float(self) -> bool: - ''' - Check if this Series has floating point numbers. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_float()` instead. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0]) - >>> s.is_float() # doctest: +SKIP - True - - ''' - def is_integer(self, signed: bool | None = ...) -> bool: - ''' - Check if this Series datatype is an integer (signed or unsigned). - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_integer()` instead. - For signed/unsigned variants, use `Series.dtype.is_signed_integer()` - or `Series.dtype.is_unsigned_integer()`. - - Parameters - ---------- - signed - * if `None`, both signed and unsigned integer dtypes will match. - * if `True`, only signed integer dtypes will be considered a match. - * if `False`, only unsigned integer dtypes will be considered a match. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) - >>> s.is_integer() # doctest: +SKIP - True - >>> s.is_integer(signed=False) # doctest: +SKIP - True - >>> s.is_integer(signed=True) # doctest: +SKIP - False - - ''' - def is_numeric(self) -> bool: - ''' - Check if this Series datatype is numeric. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_float()` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.is_numeric() # doctest: +SKIP - True - - ''' - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: - """ - Check if this Series datatype is temporal. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_temporal()` instead. - - Parameters - ---------- - excluding - Optionally exclude one or more temporal dtypes from matching. - - Examples - -------- - >>> from datetime import date - >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) - >>> s.is_temporal() # doctest: +SKIP - True - >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP - False - - """ - def is_boolean(self) -> bool: - ''' - Check if this Series is a Boolean. - - .. deprecated:: 0.19.14 - Use `Series.dtype == pl.Boolean` instead. - - Examples - -------- - >>> s = pl.Series("a", [True, False, True]) - >>> s.is_boolean() # doctest: +SKIP - True - - ''' - def is_utf8(self) -> bool: - ''' - Check if this Series datatype is a Utf8. - - .. deprecated:: 0.19.14 - Use `Series.dtype == pl.Utf8` instead. 
- - Examples - -------- - >>> s = pl.Series("x", ["a", "b", "c"]) - >>> s.is_utf8() # doctest: +SKIP - True - - ''' - def take_every(self, n: int) -> Series: - """ - Take every nth value in the Series and return as new Series. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: - """ - Take values by index. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather`. - - Parameters - ---------- - indices - Index location used for selection. - """ - def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: - """ - Set values at the index locations. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`scatter`. - - Parameters - ---------- - indices - Integers representing the index locations. - values - Replacement values. - """ - def cumsum(self) -> Series: - """ - Get an array with the cumulative sum computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_sum`. - - Parameters - ---------- - reverse - reverse the operation. - - """ - def cummax(self) -> Series: - """ - Get an array with the cumulative max computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_max`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def cummin(self) -> Series: - """ - Get an array with the cumulative min computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_min`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def cumprod(self) -> Series: - """ - Get an array with the cumulative product computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_prod`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def view(self) -> SeriesView: - """ - Get a view into this Series data with a numpy array. - - .. deprecated:: 0.19.14 - This method will be removed in a future version. - - This operation doesn't clone data, but does not include missing values. - Don't use this unless you know what you are doing. - - Parameters - ---------- - ignore_nulls - If True then nulls are converted to 0. - If False then an Exception is raised if nulls are present. - - """ - def map_dict(self, mapping: dict[Any, Any]) -> Self: - """ - Replace values in the Series using a remapping dictionary. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`replace`. The default behavior - has changed to keep any values not present in the mapping unchanged. - Pass `default=None` to keep existing behavior. - - Parameters - ---------- - mapping - Dictionary containing the before/after values to map. - default - Value to use when the remapping dict does not contain the lookup value. - Use `pl.first()`, to keep the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - """ - def series_equal(self, other: Series) -> bool: - """ - Check whether the Series is equal to another Series. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`equals`. - - Parameters - ---------- - other - Series to compare with. 
- null_equal - Consider null values as equal. - strict - Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a - `pl.Int64` will return `False`. - """ - @property - def dtype(self): ... - @property - def flags(self): ... - @property - def inner_dtype(self): ... - @property - def name(self): ... - @property - def shape(self): ... - @property - def bin(self): ... - @property - def cat(self): ... - @property - def dt(self): ... - @property - def list(self): ... - @property - def arr(self): ... - @property - def str(self): ... - @property - def struct(self): ... -def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: - """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/series/series.pyi new file mode 100644 index 0000000..0dd0f70 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.2/polars/series/series.pyi @@ -0,0 +1,5035 @@ +#: version 0.20.2 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Enum as Enum, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Null as Null, Object as Object, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown, Utf8 as Utf8 +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat +from polars.exceptions import ModuleUpgradeRequired as ModuleUpgradeRequired, ShapeError as ShapeError +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta +from polars.utils.deprecation import 
deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, _warn_null_comparison as _warn_null_comparison, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_ptr(self) -> tuple[int, int, int]: + """ + Get a pointer to the start of the values buffer of a numeric Series. + + This will raise an error if the `Series` contains multiple chunks. + + This will return the offset, length and the pointer itself. + + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series <= other`.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series < other`.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series == other`.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series == other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. 
+ + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series != other`.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series != other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series >= other`.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series > other`.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. 
+ Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the Series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to `s[0]`, with a check + that the shape is (1,). With an index, this is equivalent to `s[index]`. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cum_sum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. 
_Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """Compute the logarithm to a given base.""" + def log1p(self) -> Series: + """Compute the natural logarithm of the input array plus one, element-wise.""" + def log10(self) -> Series: + """Compute the base 10 logarithm of the input array, element-wise.""" + def exp(self) -> Series: + """Compute the exponential, element-wise.""" + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a Series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + Series has a numeric dtype). All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> s = pl.Series([1, 2, 3, 4, 5]) + >>> s.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + Non-numeric data types may not have all statistics available. 
+ + >>> s = pl.Series(["a", "a", None, "b", "c"]) + >>> s.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 4 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. 
+ + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. 
This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. + + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. 
+ If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + include_breakpoint + Include a column that indicates the upper breakpoint. + include_category + Include a column that shows the intervals as categories. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬───────┐ + │ break_point ┆ category ┆ count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═══════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴───────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬───────┐ + │ color ┆ count │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴───────┘ + + Sort the output by count. + + >>> s.value_counts(sort=True) + shape: (3, 2) + ┌───────┬───────┐ + │ color ┆ count │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴───────┘ + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. 
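# --- Illustrative sketch, not part of the generated stub: counting distinct values
# with the methods documented above, assuming the polars version these stubs were
# generated for.
import polars as pl

s = pl.Series("color", ["red", "blue", "red", "green"])
counts = s.value_counts(sort=True)   # DataFrame with one row per distinct value
assert counts.height == 3
assert s.unique_counts().sum() == 4  # per-value counts, in order of appearance
assert s.n_unique() == 3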
+ + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cum_max(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cum_max() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cum_min(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cum_min() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cum_prod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_prod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cum_sum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_sum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. 
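# --- Illustrative sketch, not part of the generated stub: cumulative aggregations
# and slicing, assuming the polars version these stubs were generated for.
import polars as pl

s = pl.Series("a", [3, 1, 2])
assert s.cum_sum().to_list() == [3, 4, 6]
assert s.cum_max().to_list() == [3, 3, 3]
assert s.cum_min().to_list() == [3, 1, 1]
assert s.slice(1, 2).to_list() == [1, 2]  # offset=1, length=2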
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + The resulting series will consist of multiple chunks. + + Parameters + ---------- + other + Series to append. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. + + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from `append`, which adds the chunks from `other` to the chunks of + this series, `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer `append` over `extend` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single `Series`. In the latter case, finish the sequence + of `append` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first `abs(n)`. 
+ + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + head + + """ + def gather_every(self, n: int) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. 
+ side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """Count the null values in this Series.""" + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no `null` values. + + Notes + ----- + While the *absence* of a validity bitmask guarantees that a Series does not + have `null` values, the converse is not true, eg: the *presence* of a + bitmask does not mean that there are null values, as every value of the + bitmask could be `false`. + + To confirm that a column has `null` values use :func:`null_count`. + + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + """ + def not_(self) -> Series: + ''' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def equals(self, other: Series) -> bool: + ''' + Check whether the Series is equal to another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + See Also + -------- + assert_series_equal + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s1.equals(s1) + True + >>> s1.equals(s2) + False + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. 
+ closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point `nan` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set `zero_copy_only=True`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def _view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + + Returns + ------- + SeriesView + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s._view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
+ + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def count(self) -> int: + ''' + Return the number of non-null elements in the column. + + See Also + -------- + len + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.count() + 2 + ''' + def len(self) -> int: + ''' + Return the number of elements in the Series. + + Null values count towards the total. + + See Also + -------- + count + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + ''' + def set(self, filter: Series, value: int | float | str | bool | None) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimization (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.scatter(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. 
+ + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def round_sig_figs(self, digits: int) -> Series: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s.round_sig_figs(2) + shape: (3,) + Series: '' [f64] + [ + 0.012 + 3.3 + 1200.0 + ] + + """ + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. 
+ + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. 
+ + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. + + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify `fill_value` to fill the resulting null values. + + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their std dev. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. 
+ center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. 
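# The `rank` docstring above lists six tie-breaking strategies. A small pure-Python
# illustration of two of them, 'ordinal' and 'average', using the same data as the
# examples that follow ([3, 6, 1, 1, 6]); a sketch of the documented tie rules, not
# polars' implementation:
def ordinal_rank(values):
    # Ties receive distinct ranks in order of appearance (stable sort).
    order = sorted(range(len(values)), key=lambda i: values[i])
    ranks = [0] * len(values)
    for rank, idx in enumerate(order, start=1):
        ranks[idx] = rank
    return ranks

def average_rank(values):
    # Ties share the mean of the ordinal ranks they would otherwise receive.
    ordinal = ordinal_rank(values)
    return [
        sum(o for v2, o in zip(values, ordinal) if v2 == v) / values.count(v)
        for v in values
    ]

print(ordinal_rank([3, 6, 1, 1, 6]))  # [3, 4, 1, 2, 5]
print(average_rank([3, 6, 1, 1, 6]))  # [3.0, 4.5, 1.5, 1.5, 4.5]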
+ + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). 
If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no lower bound is applied. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no upper bound is applied. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> s = pl.Series([-50, 5, 50, None]) + >>> s.clip(1, 10) + shape: (4,) + Series: '' [i64] + [ + 1 + 5 + 10 + null + ] + + Specifying only a single bound: + + >>> s.clip(upper_bound=10) + shape: (4,) + Series: '' [i64] + [ + -50 + 5 + 10 + null + ] + + """ + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def replace(self, old: IntoExpr | Sequence[Any] | Mapping[Any, Any], new: IntoExpr | Sequence[Any] | NoDefault = ...) -> Self: + ''' + Replace values by different values. + + Parameters + ---------- + old + Value or sequence of values to replace. + Also accepts a mapping of values to their replacement as syntactic sugar for + `replace(new=Series(mapping.keys()), old=Series(mapping.values()))`. + new + Value or sequence of values to replace by. + Length must match the length of `old` or have length 1. + default + Set values that were not replaced to this value. + Defaults to keeping the original value. + Accepts expression input. Non-expression inputs are parsed as literals. + return_dtype + The data type of the resulting Series. If set to `None` (default), + the data type is determined automatically based on the other inputs. + + See Also + -------- + str.replace + + Notes + ----- + The global string cache must be enabled when replacing categorical values. + + Examples + -------- + Replace a single value by another value. Values that were not replaced remain + unchanged. 
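# The `skew` docstring above gives the Fisher-Pearson coefficient
# g1 = m3 / m2**(3/2), with m_i the biased i-th central moment. A quick
# pure-Python check of that formula against the value quoted in the
# rolling_skew example (Series([1, 4, 2]).skew() ~= 0.381802); a sketch of
# the math only, not polars' implementation:
def central_moment(xs, i):
    mean = sum(xs) / len(xs)
    return sum((x - mean) ** i for x in xs) / len(xs)

def fisher_pearson_skew(xs):
    return central_moment(xs, 3) / central_moment(xs, 2) ** 1.5

print(fisher_pearson_skew([1, 4, 2]))  # ~0.3818017741606059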
+ + >>> s = pl.Series([1, 2, 2, 3]) + >>> s.replace(2, 100) + shape: (4,) + Series: \'\' [i64] + [ + 1 + 100 + 100 + 3 + ] + + Replace multiple values by passing sequences to the `old` and `new` parameters. + + >>> s.replace([2, 3], [100, 200]) + shape: (4,) + Series: \'\' [i64] + [ + 1 + 100 + 100 + 200 + ] + + Passing a mapping with replacements is also supported as syntactic sugar. + Specify a default to set all values that were not matched. + + >>> mapping = {2: 100, 3: 200} + >>> s.replace(mapping, default=-1) + shape: (4,) + Series: \'\' [i64] + [ + -1 + 100 + 100 + 200 + ] + + + The default can be another Series. + + >>> default = pl.Series([2.5, 5.0, 7.5, 10.0]) + >>> s.replace(2, 100, default=default) + shape: (4,) + Series: \'\' [f64] + [ + 2.5 + 100.0 + 100.0 + 10.0 + ] + + Replacing by values of a different data type sets the return type based on + a combination of the `new` data type and either the original data type or the + default data type if it was set. + + >>> s = pl.Series(["x", "y", "z"]) + >>> mapping = {"x": 1, "y": 2, "z": 3} + >>> s.replace(mapping) + shape: (3,) + Series: \'\' [str] + [ + "1" + "2" + "3" + ] + >>> s.replace(mapping, default=None) + shape: (3,) + Series: \'\' [i64] + [ + 1 + 2 + 3 + ] + + Set the `return_dtype` parameter to control the resulting data type directly. + + >>> s.replace(mapping, return_dtype=pl.UInt8) + shape: (3,) + Series: \'\' [u8] + [ + 1 + 2 + 3 + ] + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. 
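# The ewm_* docstrings above define alpha via `com`, `span`, or `half_life`
# and describe two weighting modes: `adjust=True` weights the value i steps in
# the past by (1 - alpha)**i, while `adjust=False` uses the recursion
# y_t = (1 - alpha) * y_{t-1} + alpha * x_t. A pure-Python sketch of those two
# definitions for the mean (illustrative only, not polars' implementation):
def ewm_mean(xs, alpha, adjust=True):
    out = []
    if adjust:
        for t in range(len(xs)):
            weights = [(1 - alpha) ** i for i in range(t + 1)]  # i steps back
            num = sum(w * x for w, x in zip(weights, reversed(xs[: t + 1])))
            out.append(num / sum(weights))
    else:
        y = xs[0]
        out.append(y)
        for x in xs[1:]:
            y = (1 - alpha) * y + alpha * x
            out.append(y)
    return out

# com=1 gives alpha = 1 / (1 + com) = 0.5, matching the ewm_mean example above
# for Series([1, 2, 3]): [1.0, 1.666667, 2.428571].
print(ewm_mean([1, 2, 3], alpha=0.5))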
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to window size. + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). 
+ + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() # doctest: +SKIP + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_integer()` instead. + For signed/unsigned variants, use `Series.dtype.is_signed_integer()` + or `Series.dtype.is_unsigned_integer()`. + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() # doctest: +SKIP + True + >>> s.is_integer(signed=False) # doctest: +SKIP + True + >>> s.is_integer(signed=True) # doctest: +SKIP + False + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_numeric()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() # doctest: +SKIP + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_temporal()` instead. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() # doctest: +SKIP + True + >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP + False + + """ + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Boolean` instead. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() # doctest: +SKIP + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a Utf8. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Utf8` instead. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() # doctest: +SKIP + True + + ''' + def take_every(self, n: int) -> Series: + """ + Take every nth value in the Series and return as new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + Index location used for selection. + """ + def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + """ + Set values at the index locations. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`scatter`. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. 
+ """ + def cumsum(self) -> Series: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + reverse the operation. + + """ + def cummax(self) -> Series: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummin(self) -> Series: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cumprod(self) -> Series: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def view(self) -> SeriesView: + """ + Get a view into this Series data with a numpy array. + + .. deprecated:: 0.19.14 + This method will be removed in a future version. + + This operation doesn't clone data, but does not include missing values. + Don't use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in the Series using a remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + def series_equal(self, other: Series) -> bool: + """ + Check whether the Series is equal to another Series. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`equals`. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... 
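# The bare properties closing the class body (`.bin`, `.cat`, `.dt`, `.list`,
# `.arr`, `.str`, `.struct`) are polars' Series namespace accessors: each one
# returns a helper object grouping the methods for that data type. Illustrative
# usage, assuming a recent polars version:
import polars as pl

words = pl.Series("words", ["foo", "bar"])
print(words.str.to_uppercase())  # ["FOO", "BAR"]

nested = pl.Series("xs", [[1, 2], [3]])
print(nested.list.len())  # lengths of the inner lists: [2, 1]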
+def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/dataframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/dataframe/frame deleted file mode 100644 index 562effd..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/dataframe/frame +++ /dev/null @@ -1,6977 +0,0 @@ -import P -import deltalake -import np as np -import pa as pa -import pd as pd -from _io import BytesIO, TextIOWrapper - -from builtins import PyDataFrame -from pathlib import Path -from polars.dataframe._html import NotebookFormatter as NotebookFormatter -from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy -from polars.datatypes.classes import Boolean as Boolean, Float64 as Float64, NUMERIC_DTYPES as NUMERIC_DTYPES, Object as Object, Utf8 as Utf8 -from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat -from polars.exceptions import NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError -from polars.functions.col import col as col -from polars.functions.lit import lit as lit -from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name -from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors -from polars.slice import PolarsSlice as PolarsSlice -from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression -from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s -from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, can_create_dicts_with_pyarrow as can_create_dicts_with_pyarrow, 
handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes -from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, Iterable, Iterator, Mapping, NoReturn, Sequence - -TYPE_CHECKING: bool -INTEGER_DTYPES: frozenset -N_INFER_DEFAULT: int -_PYARROW_AVAILABLE: bool -_dtype_str_repr: builtin_function_or_method - -class DataFrame: - _accessors: _ClassVar[set] = ... - columns: Incomplete - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... - @classmethod - def _from_pydf(cls, py_df: PyDataFrame) -> Self: - """Construct Polars DataFrame from FFI PyDataFrame object.""" - @classmethod - def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... - @classmethod - def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from a dictionary of sequences. - - Parameters - ---------- - data : dict of sequences - Two-dimensional data represented as a dictionary. dict must contain - Sequences. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - - """ - @classmethod - def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from a sequence of sequences. - - Parameters - ---------- - data : Sequence of sequences - Two-dimensional data represented as a sequence of sequences. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - orient : {'col', 'row'}, default None - Whether to interpret two-dimensional data as columns or as rows. If None, - the orientation is inferred by matching the columns and data dimensions. If - this does not yield conclusive results, column orientation is used. 
- infer_schema_length - How many rows to scan to determine the column type. - - """ - @classmethod - def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from a numpy ndarray. - - Parameters - ---------- - data : numpy ndarray - Two-dimensional data represented as a numpy ndarray. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - orient : {'col', 'row'}, default None - Whether to interpret two-dimensional data as columns or as rows. If None, - the orientation is inferred by matching the columns and data dimensions. If - this does not yield conclusive results, column orientation is used. - - """ - @classmethod - def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a DataFrame from an Arrow table. - - This operation will be zero copy for the most part. Types that are not - supported by Polars may be cast to the closest supported type. - - Parameters - ---------- - data : arrow table, array, or sequence of sequences - Data representing an Arrow Table or Array. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - rechunk : bool, default True - Make sure that all data is in contiguous memory. - - """ - @classmethod - def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: - """ - Construct a Polars DataFrame from a pandas DataFrame. - - Parameters - ---------- - data : pandas DataFrame - Two-dimensional data represented as a pandas DataFrame. - schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The DataFrame schema may be declared in several ways: - - * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. - * As a list of column names; in this case types are automatically inferred. - * As a list of (name,type) pairs; this is equivalent to the dictionary form. - - If you supply a list of column names that does not match the names in the - underlying data, the names given here will overwrite them. 
The number - of names given in the schema should match the underlying data dimensions. - schema_overrides : dict, default None - Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. - rechunk : bool, default True - Make sure that all data is in contiguous memory. - nan_to_null : bool, default True - If the data contains NaN values they will be converted to null/None. - include_index : bool, default False - Load any non-default pandas indexes as columns. - - """ - @classmethod - def _read_csv(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: - """ - Read a CSV file into a DataFrame. - - Use `pl.read_csv` to dispatch to this method. - - See Also - -------- - polars.io.read_csv - - """ - @classmethod - def _read_parquet(cls, source: str | Path | BinaryIO | bytes) -> DataFrame: - """ - Read into a DataFrame from a parquet file. - - Use `pl.read_parquet` to dispatch to this method. - - See Also - -------- - polars.io.read_parquet - - """ - @classmethod - def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: - """ - Read into a DataFrame from Apache Avro format. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns. - n_rows - Stop reading from Apache Avro file after reading `n_rows`. - - """ - @classmethod - def _read_ipc(cls, source: str | Path | BinaryIO | bytes) -> Self: - ''' - Read into a DataFrame from Arrow IPC file format. - - See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. - Arrow IPC files are also known as Feather (v2) files. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns to select. Accepts a list of column indices (starting at zero) or a - list of column names. - n_rows - Stop reading from IPC file after reading `n_rows`. - row_count_name - Row count name. - row_count_offset - Row count offset. - rechunk - Make sure that all data is contiguous. - memory_map - Memory map the file - - ''' - @classmethod - def _read_ipc_stream(cls, source: str | Path | BinaryIO | bytes) -> Self: - ''' - Read into a DataFrame from Arrow IPC record batch stream format. - - See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - columns - Columns to select. Accepts a list of column indices (starting at zero) or a - list of column names. - n_rows - Stop reading from IPC stream after reading `n_rows`. - row_count_name - Row count name. - row_count_offset - Row count offset. - rechunk - Make sure that all data is contiguous. - - ''' - @classmethod - def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: - """ - Read into a DataFrame from a JSON file. - - Use `pl.read_json` to dispatch to this method. - - See Also - -------- - polars.io.read_json - - """ - @classmethod - def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: - """ - Read into a DataFrame from a newline delimited JSON file. 
- - Use `pl.read_ndjson` to dispatch to this method. - - See Also - -------- - polars.io.read_ndjson - - """ - def _replace(self, column: str, new_column: Series) -> Self: - """Replace a column by a new Series (in place).""" - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: - """ - Numpy __array__ interface protocol. - - Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see - https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. - """ - def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: - ''' - Convert to a dataframe object implementing the dataframe interchange protocol. - - Parameters - ---------- - nan_as_null - Overwrite null values in the data with `NaN`. - - .. warning:: - This functionality has not been implemented and the parameter will be - removed in a future version. - Setting this to `True` will raise a `NotImplementedError`. - allow_copy - Allow memory to be copied to perform the conversion. If set to `False`, - causes conversions that are not zero-copy to fail. - - Notes - ----- - Details on the Python dataframe interchange protocol: - https://data-apis.org/dataframe-protocol/latest/index.html - - Examples - -------- - Convert a Polars DataFrame to a generic dataframe object and access some - properties. - - >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) - >>> dfi = df.__dataframe__() - >>> dfi.num_rows() - 2 - >>> dfi.get_column(1).dtype - (, 64, \'g\', \'=\') - - ''' - def __dataframe_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. - """ - def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with another object.""" - def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with another DataFrame.""" - def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: - """Compare a DataFrame with a non-DataFrame object.""" - def _div(self, other: Any) -> DataFrame: ... - def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... - def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... - def __bool__(self) -> NoReturn: ... - def __eq__(self, other: Any) -> DataFrame: ... - def __ne__(self, other: Any) -> DataFrame: ... - def __gt__(self, other: Any) -> DataFrame: ... - def __lt__(self, other: Any) -> DataFrame: ... - def __ge__(self, other: Any) -> DataFrame: ... - def __le__(self, other: Any) -> DataFrame: ... - def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... - def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... - def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... - def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... - def __contains__(self, key: str) -> bool: ... - def __iter__(self) -> Iterator[Series]: ... - def __reversed__(self) -> Iterator[Series]: ... - def _pos_idx(self, idx: int, dim: int) -> int: ... 
- def _take_with_series(self, s: Series) -> DataFrame: ... - def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: - """Get item. Does quite a lot. Read the comments.""" - def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... - def __len__(self) -> int: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def _ipython_key_completions_(self) -> list[str]: ... - def _repr_html_(self, **kwargs: Any) -> str: - """ - Format output data in HTML for display in Jupyter Notebooks. - - Output rows and columns can be modified by setting the following ENVIRONMENT - variables: - - * POLARS_FMT_MAX_COLS: set the number of columns - * POLARS_FMT_MAX_ROWS: set the number of rows - - """ - def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: - ''' - Return the DataFrame as a scalar, or return the element at the given row/column. - - Parameters - ---------- - row - Optional row index. - column - Optional column index or name. - - See Also - -------- - row: Get the values of a single row, either by index or by predicate. - - Notes - ----- - If row/col not provided, this is equivalent to `df[0,0]`, with a check that - the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> df.select((pl.col("a") * pl.col("b")).sum()).item() - 32 - >>> df.item(1, 1) - 5 - >>> df.item(2, "b") - 6 - - ''' - def to_arrow(self) -> pa.Table: - ''' - Collect the underlying arrow arrays in an Arrow Table. - - This operation is mostly zero copy. - - Data types that do copy: - - CategoricalType - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} - ... ) - >>> df.to_arrow() - pyarrow.Table - foo: int64 - bar: large_string - ---- - foo: [[1,2,3,4,5,6]] - bar: [["a","b","c","d","e","f"]] - - ''' - def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: - ''' - Convert DataFrame to a dictionary mapping column name to values. - - Parameters - ---------- - as_series - True -> Values are Series - False -> Values are List[Any] - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4, 5], - ... "fruits": ["banana", "banana", "apple", "apple", "banana"], - ... "B": [5, 4, 3, 2, 1], - ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], - ... "optional": [28, 300, None, 2, -30], - ... } - ... 
) - >>> df - shape: (5, 5) - ┌─────┬────────┬─────┬────────┬──────────┐ - │ A ┆ fruits ┆ B ┆ cars ┆ optional │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ - ╞═════╪════════╪═════╪════════╪══════════╡ - │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ - │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ - │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ - │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ - │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ - └─────┴────────┴─────┴────────┴──────────┘ - >>> df.to_dict(as_series=False) - {\'A\': [1, 2, 3, 4, 5], - \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], - \'B\': [5, 4, 3, 2, 1], - \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], - \'optional\': [28, 300, None, 2, -30]} - >>> df.to_dict(as_series=True) - {\'A\': shape: (5,) - Series: \'A\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ], \'fruits\': shape: (5,) - Series: \'fruits\' [str] - [ - "banana" - "banana" - "apple" - "apple" - "banana" - ], \'B\': shape: (5,) - Series: \'B\' [i64] - [ - 5 - 4 - 3 - 2 - 1 - ], \'cars\': shape: (5,) - Series: \'cars\' [str] - [ - "beetle" - "audi" - "beetle" - "beetle" - "beetle" - ], \'optional\': shape: (5,) - Series: \'optional\' [i64] - [ - 28 - 300 - null - 2 - -30 - ]} - - ''' - def to_dicts(self) -> list[dict[str, Any]]: - ''' - Convert every row to a dictionary of Python-native values. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.to_dicts() - [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] - - ''' - def to_numpy(self) -> np.ndarray[Any, Any]: - ''' - Convert DataFrame to a 2D NumPy array. - - This operation clones data. - - Parameters - ---------- - structured - Optionally return a structured array, with field names and - dtypes that correspond to the DataFrame schema. - order - The index order of the returned NumPy array, either C-like or - Fortran-like. In general, using the Fortran-like index order is faster. - However, the C-like order might be more appropriate to use for downstream - applications to prevent cloning data, e.g. when reshaping into a - one-dimensional array. Note that this option only takes effect if - `structured` is set to `False` and the DataFrame dtypes allow for a - global dtype for all columns. - - Notes - ----- - If you\'re attempting to convert Utf8 to an array you\'ll need to install - `pyarrow`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.5, 7.0, 8.5], - ... "ham": ["a", "b", "c"], - ... }, - ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, - ... ) - - Export to a standard 2D numpy array. - - >>> df.to_numpy() - array([[1, 6.5, \'a\'], - [2, 7.0, \'b\'], - [3, 8.5, \'c\']], dtype=object) - - Export to a structured array, which can better-preserve individual - column data, such as name and dtype... - - >>> df.to_numpy(structured=True) - array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], - dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np - >>> df.to_numpy(structured=True).view(np.recarray) - rec.array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], - dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: - ''' - Cast to a pandas DataFrame. 
- - This requires that :mod:`pandas` and :mod:`pyarrow` are installed. - This operation clones data, unless `use_pyarrow_extension_array=True`. - - Parameters - ---------- - use_pyarrow_extension_array - Use PyArrow backed-extension arrays instead of numpy arrays for each column - of the pandas DataFrame; this allows zero copy operations and preservation - of null values. Subsequent operations on the resulting pandas DataFrame may - trigger conversion to NumPy arrays if that operation is not supported by - pyarrow compute functions. - **kwargs - Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. - - Returns - ------- - :class:`pandas.DataFrame` - - Examples - -------- - >>> import pandas - >>> df1 = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> pandas_df1 = df1.to_pandas() - >>> type(pandas_df1) - - >>> pandas_df1.dtypes - foo int64 - bar int64 - ham object - dtype: object - >>> df2 = pl.DataFrame( - ... { - ... "foo": [1, 2, None], - ... "bar": [6, None, 8], - ... "ham": [None, "b", "c"], - ... } - ... ) - >>> pandas_df2 = df2.to_pandas() - >>> pandas_df2 - foo bar ham - 0 1.0 6.0 None - 1 2.0 NaN b - 2 NaN 8.0 c - >>> pandas_df2.dtypes - foo float64 - bar float64 - ham object - dtype: object - >>> pandas_df2_pa = df2.to_pandas( - ... use_pyarrow_extension_array=True - ... ) # doctest: +SKIP - >>> pandas_df2_pa # doctest: +SKIP - foo bar ham - 0 1 6 - 1 2 b - 2 8 c - >>> pandas_df2_pa.dtypes # doctest: +SKIP - foo int64[pyarrow] - bar int64[pyarrow] - ham large_string[pyarrow] - dtype: object - - ''' - def to_series(self, index: int = ...) -> Series: - ''' - Select column as Series at index location. - - Parameters - ---------- - index - Location of selection. - - See Also - -------- - get_column - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.to_series(1) - shape: (3,) - Series: \'bar\' [i64] - [ - 6 - 7 - 8 - ] - - ''' - def to_init_repr(self, n: int = ...) -> str: - ''' - Convert DataFrame to instantiatable string representation. - - Parameters - ---------- - n - Only use first n rows. - - See Also - -------- - polars.Series.to_init_repr - polars.from_repr - - Examples - -------- - >>> df = pl.DataFrame( - ... [ - ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), - ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), - ... pl.Series("ham", ["a", "b", "c"], dtype=pl.Categorical), - ... ] - ... ) - >>> print(df.to_init_repr()) - pl.DataFrame( - [ - pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), - pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), - pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.Categorical), - ] - ) - - >>> df_from_str_repr = eval(df.to_init_repr()) - >>> df_from_str_repr - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ f32 ┆ cat │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 7.0 ┆ b │ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: - ''' - Serialize to JSON representation. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - If set to `None` (default), the output is returned as a string instead. - pretty - Pretty serialize json. - row_oriented - Write to row oriented json. This is slower, but more common. 
- - See Also - -------- - DataFrame.write_ndjson - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... } - ... ) - >>> df.write_json() - \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' - >>> df.write_json(row_oriented=True) - \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' - - ''' - def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: - ''' - Serialize to newline delimited JSON representation. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - If set to `None` (default), the output is returned as a string instead. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... } - ... ) - >>> df.write_ndjson() - \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' - - ''' - def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: - ''' - Write to comma-separated values (CSV) file. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - If set to `None` (default), the output is returned as a string instead. - include_bom - Whether to include UTF-8 BOM in the CSV output. - include_header - Whether to include header in the CSV output. - separator - Separate CSV fields with this symbol. - line_terminator - String used to end each row. - quote_char - Byte to use as quoting character. - batch_size - Number of rows that will be processed per thread. - datetime_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. If no format specified, the default fractional-second - precision is inferred from the maximum timeunit found in the frame\'s - Datetime cols (if any). - date_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. - time_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. - float_precision - Number of decimal places to write, applied to both `Float32` and - `Float64` datatypes. - null_value - A string representing null values (defaulting to the empty string). - quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} - Determines the quoting strategy used. - - - necessary (default): This puts quotes around fields only when necessary. - They are necessary when fields contain a quote, - separator or record terminator. - Quotes are also necessary when writing an empty record - (which is indistinguishable from a record with one empty field). - This is the default. - - always: This puts quotes around every field. Always. - - never: This never puts quotes around fields, even if that results in - invalid CSV data (e.g.: by not quoting strings containing the separator). - - non_numeric: This puts quotes around all fields that are non-numeric. - Namely, when writing a field that does not parse as a valid float - or integer, then quotes will be used even if they aren`t strictly - necessary. - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... 
) - >>> path: pathlib.Path = dirpath / "new_file.csv" - >>> df.write_csv(path, separator=",") - - ''' - def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: - ''' - Write to Apache Avro file. - - Parameters - ---------- - file - File path or writeable file-like object to which the data will be written. - compression : {\'uncompressed\', \'snappy\', \'deflate\'} - Compression method. Defaults to "uncompressed". - name - Schema name. Defaults to empty string. - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.avro" - >>> df.write_avro(path) - - ''' - def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: - ''' - Write frame data to a table in an Excel workbook/worksheet. - - Parameters - ---------- - workbook : Workbook - String name or path of the workbook to create, BytesIO object to write - into, or an open `xlsxwriter.Workbook` object that has not been closed. - If None, writes to a `dataframe.xlsx` workbook in the working directory. - worksheet : str - Name of target worksheet; if None, writes to "Sheet1" when creating a new - workbook (note that writing to an existing workbook requires a valid - existing -or new- worksheet name). - position : {str, tuple} - Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. - table_style : {str, dict} - A named Excel table style, such as "Table Style Medium 4", or a dictionary - of `{"key":value,}` options containing one or more of the following keys: - "style", "first_column", "last_column", "banded_columns, "banded_rows". - table_name : str - Name of the output table object in the worksheet; can then be referred to - in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. - column_formats : dict - A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an - Excel format string to the given columns. Formats defined here (such as - "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. - dtype_formats : dict - A `{dtype:str,}` dictionary that sets the default Excel format for the - given dtype. (This can be overridden on a per-column basis by the - `column_formats` param). It is also valid to use dtype groups such as - `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform - integer and float formats. - conditional_formats : dict - A dictionary of colname (or selector) keys to a format str, dict, or list - that defines conditional formatting options for the specified columns. - - * If supplying a string typename, should be one of the valid `xlsxwriter` - types such as "3_color_scale", "data_bar", etc. - * If supplying a dictionary you can make use of any/all `xlsxwriter` - supported options, including icon sets, formulae, etc. - * Supplying multiple columns as a tuple/key will apply a single format - across all columns - this is effective in creating a heatmap, as the - min/max values will be determined across the entire range, not per-column. - * Finally, you can also supply a list made up from the above options - in order to apply *more* than one conditional format to the same range. 
- header_format : dict - A `{key:value,}` dictionary of `xlsxwriter` format options to apply - to the table header row, such as `{"bold":True, "font_color":"#702963"}`. - column_totals : {bool, list, dict} - Add a column-total row to the exported table. - - * If True, all numeric columns will have an associated total using "sum". - * If passing a string, it must be one of the valid total function names - and all numeric columns will have an associated total using that function. - * If passing a list of colnames, only those given will have a total. - * For more control, pass a `{colname:funcname,}` dict. - - Valid total function names are "average", "count_nums", "count", "max", - "min", "std_dev", "sum", and "var". - column_widths : {dict, int} - A `{colname:int,}` or `{selector:int,}` dict or a single integer that - sets (or overrides if autofitting) table column widths, in integer pixel - units. If given as an integer the same value is used for all table columns. - row_totals : {dict, bool} - Add a row-total column to the right-hand side of the exported table. - - * If True, a column called "total" will be added at the end of the table - that applies a "sum" function row-wise across all numeric columns. - * If passing a list/sequence of column names, only the matching columns - will participate in the sum. - * Can also pass a `{colname:columns,}` dictionary to create one or - more total columns with distinct names, referencing different columns. - row_heights : {dict, int} - An int or `{row_index:int,}` dictionary that sets the height of the given - rows (if providing a dictionary) or all rows (if providing an integer) that - intersect with the table body (including any header and total row) in - integer pixel units. Note that `row_index` starts at zero and will be - the header row (unless `include_header` is False). - sparklines : dict - A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more - sparklines to be written into a new column in the table. - - * If passing a list of colnames (used as the source of the sparkline data) - the default sparkline settings are used (eg: line chart with no markers). - * For more control an `xlsxwriter`-compliant options dict can be supplied, - in which case three additional polars-specific keys are available: - "columns", "insert_before", and "insert_after". These allow you to define - the source columns and position the sparkline(s) with respect to other - table columns. If no position directive is given, sparklines are added to - the end of the table (eg: to the far right) in the order they are given. - formulas : dict - A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or - more formulas to be written into a new column in the table. Note that you - are strongly advised to use structured references in your formulae wherever - possible to make it simple to reference columns by name. - - * If providing a string formula (such as "=[@colx]*[@coly]") the column will - be added to the end of the table (eg: to the far right), after any default - sparklines and before any row_totals. - * For the most control supply an options dictionary with the following keys: - "formula" (mandatory), one of "insert_before" or "insert_after", and - optionally "return_dtype". The latter is used to appropriately format the - output of the formula and allow it to participate in row/column totals. 
- float_precision : int - Default number of decimals displayed for floating point columns (note that - this is purely a formatting directive; the actual values are not rounded). - include_header : bool - Indicate if the table should be created with a header row. - autofilter : bool - If the table has headers, provide autofilter capability. - autofit : bool - Calculate individual column widths from the data. - hidden_columns : list - A list or selector representing table columns to hide in the worksheet. - hide_gridlines : bool - Do not display any gridlines on the output worksheet. - sheet_zoom : int - Set the default zoom level of the output worksheet. - freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) - Freeze workbook panes. - - * If (row, col) is supplied, panes are split at the top-left corner of the - specified cell, which are 0-indexed. Thus, to freeze only the top row, - supply (1, 0). - * Alternatively, cell notation can be used to supply the cell. For example, - "A2" indicates the split occurs at the top-left of cell A2, which is the - equivalent of (1, 0). - * If (row, col, top_row, top_col) are supplied, the panes are split based on - the `row` and `col`, and the scrolling region is inititalized to begin at - the `top_row` and `top_col`. Thus, to freeze only the top row and have the - scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). - Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. - - Notes - ----- - * A list of compatible `xlsxwriter` format property names can be found here: - https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties - - * Conditional formatting dictionaries should provide xlsxwriter-compatible - definitions; polars will take care of how they are applied on the worksheet - with respect to the relative sheet/column position. For supported options, - see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html - - * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible - key/values, as well as a mandatory polars "columns" key that defines the - sparkline source data; these source columns should all be adjacent. Two other - polars-specific keys are available to help define where the sparkline appears - in the table: "insert_after", and "insert_before". The value associated with - these keys should be the name of a column in the exported table. - https://xlsxwriter.readthedocs.io/working_with_sparklines.html - - * Formula dictionaries *must* contain a key called "formula", and then optional - "insert_after", "insert_before", and/or "return_dtype" keys. These additional - keys allow the column to be injected into the table at a specific location, - and/or to define the return type of the formula (eg: "Int64", "Float64", etc). - Formulas that refer to table columns should use Excel\'s structured references - syntax to ensure the formula is applied correctly and is table-relative. - https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e - - Examples - -------- - Instantiate a basic DataFrame: - - >>> from random import uniform - >>> from datetime import date - >>> - >>> df = pl.DataFrame( - ... { - ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], - ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], - ... "val": [10_000, 20_000, 30_000], - ... } - ... 
) - - Export to "dataframe.xlsx" (the default workbook name, if not specified) in the - working directory, add column totals ("sum" by default) on all numeric columns, - then autofit: - - >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP - - Write frame to a specific location on the sheet, set a named table style, - apply US-style date formatting, increase default float precision, apply a - non-default total function to a single column, autofit: - - >>> df.write_excel( # doctest: +SKIP - ... position="B4", - ... table_style="Table Style Light 16", - ... dtype_formats={pl.Date: "mm/dd/yyyy"}, - ... column_totals={"num": "average"}, - ... float_precision=6, - ... autofit=True, - ... ) - - Write the same frame to a named worksheet twice, applying different styles - and conditional formatting to each table, adding table titles using explicit - xlsxwriter integration: - - >>> from xlsxwriter import Workbook - >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP - ... # basic/default conditional formatting - ... df.write_excel( - ... workbook=wb, - ... worksheet="data", - ... position=(3, 1), # specify position as (row,col) coordinates - ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, - ... table_style="Table Style Medium 4", - ... ) - ... - ... # advanced conditional formatting, custom styles - ... df.write_excel( - ... workbook=wb, - ... worksheet="data", - ... position=(len(df) + 7, 1), - ... table_style={ - ... "style": "Table Style Light 4", - ... "first_column": True, - ... }, - ... conditional_formats={ - ... "num": { - ... "type": "3_color_scale", - ... "min_color": "#76933c", - ... "mid_color": "#c4d79b", - ... "max_color": "#ebf1de", - ... }, - ... "val": { - ... "type": "data_bar", - ... "data_bar_2010": True, - ... "bar_color": "#9bbb59", - ... "bar_negative_color_same": True, - ... "bar_negative_border_color_same": True, - ... }, - ... }, - ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, - ... column_widths={"val": 125}, - ... autofit=True, - ... ) - ... - ... # add some table titles (with a custom format) - ... ws = wb.get_worksheet_by_name("data") - ... fmt_title = wb.add_format( - ... { - ... "font_color": "#4f6228", - ... "font_size": 12, - ... "italic": True, - ... "bold": True, - ... } - ... ) - ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) - ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) - ... - - Export a table containing two different types of sparklines. Use default - options for the "trend" sparkline and customised options (and positioning) - for the "+/-" win_loss sparkline, with non-default integer dtype formatting, - column totals, a subtle two-tone heatmap and hidden worksheet gridlines: - - >>> df = pl.DataFrame( - ... { - ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], - ... "q1": [100, 55, -20, 0, 35], - ... "q2": [30, -10, 15, 60, 20], - ... "q3": [-50, 0, 40, 80, 80], - ... "q4": [75, 55, 25, -10, -55], - ... } - ... ) - >>> df.write_excel( # doctest: +SKIP - ... table_style="Table Style Light 2", - ... # apply accounting format to all flavours of integer - ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, - ... sparklines={ - ... # default options; just provide source cols - ... "trend": ["q1", "q2", "q3", "q4"], - ... # customised sparkline type, with positioning directive - ... "+/-": { - ... "columns": ["q1", "q2", "q3", "q4"], - ... "insert_after": "id", - ... "type": "win_loss", - ... }, - ... }, - ... conditional_formats={ - ... 
# create a unified multi-column heatmap - ... ("q1", "q2", "q3", "q4"): { - ... "type": "2_color_scale", - ... "min_color": "#95b3d7", - ... "max_color": "#ffffff", - ... }, - ... }, - ... column_totals=["q1", "q2", "q3", "q4"], - ... row_totals=True, - ... hide_gridlines=True, - ... ) - - Export a table containing an Excel formula-based column that calculates a - standardised Z-score, showing use of structured references in conjunction - with positioning directives, column totals, and custom formatting. - - >>> df = pl.DataFrame( - ... { - ... "id": ["a123", "b345", "c567", "d789", "e101"], - ... "points": [99, 45, 50, 85, 35], - ... } - ... ) - >>> df.write_excel( # doctest: +SKIP - ... table_style={ - ... "style": "Table Style Medium 15", - ... "first_column": True, - ... }, - ... column_formats={ - ... "id": {"font": "Consolas"}, - ... "points": {"align": "center"}, - ... "z-score": {"align": "center"}, - ... }, - ... column_totals="average", - ... formulas={ - ... "z-score": { - ... # use structured references to refer to the table columns and \'totals\' row - ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", - ... "insert_after": "points", - ... "return_dtype": pl.Float64, - ... } - ... }, - ... hide_gridlines=True, - ... sheet_zoom=125, - ... ) - - ''' - def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: - ''' - Write to Arrow IPC binary stream or Feather file. - - See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - file - Path or writeable file-like object to which the IPC data will be - written. If set to `None`, the output is returned as a BytesIO object. - compression : {\'uncompressed\', \'lz4\', \'zstd\'} - Compression method. Defaults to "uncompressed". - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.arrow" - >>> df.write_ipc(path) - - ''' - def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: - ''' - Write to Arrow IPC record batch stream. - - See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. - - Parameters - ---------- - file - Path or writeable file-like object to which the IPC record batch data will - be written. If set to `None`, the output is returned as a BytesIO object. - compression : {\'uncompressed\', \'lz4\', \'zstd\'} - Compression method. Defaults to "uncompressed". - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.arrow" - >>> df.write_ipc_stream(path) - - ''' - def write_parquet(self, file: str | Path | BytesIO) -> None: - ''' - Write to Apache Parquet file. - - Parameters - ---------- - file - File path or writeable file-like object to which the result will be written. - compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. - Choose "snappy" for more backwards compatibility guarantees - when you deal with older parquet readers. 
- compression_level - The level of compression to use. Higher compression means smaller files on - disk. - - - "gzip" : min-level: 0, max-level: 10. - - "brotli" : min-level: 0, max-level: 11. - - "zstd" : min-level: 1, max-level: 22. - - statistics - Write statistics to the parquet headers. This requires extra compute. - row_group_size - Size of the row groups in number of rows. Defaults to 512^2 rows. - use_pyarrow - Use C++ parquet implementation vs Rust parquet implementation. - At the moment C++ supports more features. - pyarrow_options - Arguments passed to `pyarrow.parquet.write_table`. - - If you pass `partition_cols` here, the dataset will be written - using `pyarrow.parquet.write_to_dataset`. - The `partition_cols` parameter leads to write the dataset to a directory. - Similar to Spark\'s partitioned datasets. - - Examples - -------- - >>> import pathlib - >>> - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> path: pathlib.Path = dirpath / "new_file.parquet" - >>> df.write_parquet(path) - - We can use pyarrow with use_pyarrow_write_to_dataset=True - to write partitioned datasets. The following example will - write the first row to ../watermark=1/*.parquet and the - other rows to ../watermark=2/*.parquet. - - >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) - >>> path: pathlib.Path = dirpath / "partitioned_object" - >>> df.write_parquet( - ... path, - ... use_pyarrow=True, - ... pyarrow_options={"partition_cols": ["watermark"]}, - ... ) - - ''' - def write_database(self, table_name: str, connection: str) -> None: - ''' - Write a polars frame to a database. - - Parameters - ---------- - table_name - Name of the table to create or append to in the target SQL database. - If your table name contains special characters, it should be quoted. - connection - Connection URI string, for example: - - * "postgresql://user:pass@server:port/database" - * "sqlite:////path/to/database.db" - if_exists : {\'append\', \'replace\', \'fail\'} - The insert mode. - \'replace\' will create a new database table, overwriting an existing one. - \'append\' will append to an existing table. - \'fail\' will fail if table already exists. - engine : {\'sqlalchemy\', \'adbc\'} - Select the engine used for writing the data. - ''' - def write_delta(self, target: str | Path | deltalake.DeltaTable) -> None: - ''' - Write DataFrame as delta table. - - Parameters - ---------- - target - URI of a table or a DeltaTable object. - mode : {\'error\', \'append\', \'overwrite\', \'ignore\'} - How to handle existing data. - - * If \'error\', throw an error if the table already exists (default). - * If \'append\', will add new data. - * If \'overwrite\', will replace table with new data. - * If \'ignore\', will not write anything if table already exists. - overwrite_schema - If True, allows updating the schema of the table. - storage_options - Extra options for the storage backends supported by `deltalake`. - For cloud storages, this may include configurations for authentication etc. - - * See a list of supported storage options for S3 `here `__. - * See a list of supported storage options for GCS `here `__. - * See a list of supported storage options for Azure `here `__. - delta_write_options - Additional keyword arguments while writing a Delta lake Table. - See a list of supported write options `here `__. - - Raises - ------ - TypeError - If the DataFrame contains unsupported data types. 
- ArrowInvalidError - If the DataFrame contains data types that could not be cast to their - primitive type. - - Notes - ----- - The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` - are not supported by the delta protocol specification and will raise a - TypeError. - - Some other data types are not supported but have an associated `primitive type - `__ - to which they can be cast. This affects the following data types: - - - Unsigned integers - - :class:`Datetime` types with millisecond or nanosecond precision or with - time zone information - - :class:`Utf8`, :class:`Binary`, and :class:`List` (\'large\' types) - - Polars columns are always nullable. To write data to a delta table with - non-nullable columns, a custom pyarrow schema has to be passed to the - `delta_write_options`. See the last example below. - - Examples - -------- - Write a dataframe to the local filesystem as a Delta Lake table. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> table_path = "/path/to/delta-table/" - >>> df.write_delta(table_path) # doctest: +SKIP - - Append data to an existing Delta Lake table on the local filesystem. - Note that this will fail if the schema of the new data does not match the - schema of the existing table. - - >>> df.write_delta(table_path, mode="append") # doctest: +SKIP - - Overwrite a Delta Lake table as a new version. - If the schemas of the new and old data are the same, setting - `overwrite_schema` is not required. - - >>> existing_table_path = "/path/to/delta-table/" - >>> df.write_delta( - ... existing_table_path, mode="overwrite", overwrite_schema=True - ... ) # doctest: +SKIP - - Write a dataframe as a Delta Lake table to a cloud object store like S3. - - >>> table_path = "s3://bucket/prefix/to/delta-table/" - >>> df.write_delta( - ... table_path, - ... storage_options={ - ... "AWS_REGION": "THE_AWS_REGION", - ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", - ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", - ... }, - ... ) # doctest: +SKIP - - Write DataFrame as a Delta Lake table with non-nullable columns. - - >>> import pyarrow as pa - >>> existing_table_path = "/path/to/delta-table/" - >>> df.write_delta( - ... existing_table_path, - ... delta_write_options={ - ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) - ... }, - ... ) # doctest: +SKIP - - ''' - def estimated_size(self, unit: SizeUnit = ...) -> int | float: - ''' - Return an estimation of the total (heap) allocated size of the `DataFrame`. - - Estimated size is given in the specified unit (bytes by default). - - This estimation is the sum of the size of its buffers, validity, including - nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the - size of 2 arrays is not the sum of the sizes computed from this function. In - particular, [`StructArray`]\'s size is an upper bound. - - When an array is sliced, its allocated size remains constant because the buffer - unchanged. However, this function will yield a smaller number. This is because - this function returns the visible size of the buffer, not its total capacity. - - FFI buffers are included in this estimation. - - Parameters - ---------- - unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} - Scale the returned size to the given unit. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "x": list(reversed(range(1_000_000))), - ... 
"y": [v / 1000 for v in range(1_000_000)], - ... "z": [str(v) for v in range(1_000_000)], - ... }, - ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], - ... ) - >>> df.estimated_size() - 25888898 - >>> df.estimated_size("mb") - 24.689577102661133 - - ''' - def transpose(self) -> Self: - ''' - Transpose a DataFrame over the diagonal. - - Parameters - ---------- - include_header - If set, the column names will be added as first column. - header_name - If `include_header` is set, this determines the name of the column that will - be inserted. - column_names - Optional iterable yielding strings or a string naming an existing column. - These will name the value (non-header) columns in the transposed data. - - Notes - ----- - This is a very expensive operation. Perhaps you can do it differently. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) - >>> df.transpose(include_header=True) - shape: (2, 4) - ┌────────┬──────────┬──────────┬──────────┐ - │ column ┆ column_0 ┆ column_1 ┆ column_2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞════════╪══════════╪══════════╪══════════╡ - │ a ┆ 1 ┆ 2 ┆ 3 │ - │ b ┆ 1 ┆ 2 ┆ 3 │ - └────────┴──────────┴──────────┴──────────┘ - - Replace the auto-generated column names with a list - - >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 2 ┆ 3 │ - │ 1 ┆ 2 ┆ 3 │ - └─────┴─────┴─────┘ - - Include the header as a separate column - - >>> df.transpose( - ... include_header=True, header_name="foo", column_names=["a", "b", "c"] - ... ) - shape: (2, 4) - ┌─────┬─────┬─────┬─────┐ - │ foo ┆ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═════╡ - │ a ┆ 1 ┆ 2 ┆ 3 │ - │ b ┆ 1 ┆ 2 ┆ 3 │ - └─────┴─────┴─────┴─────┘ - - Replace the auto-generated column with column names from a generator function - - >>> def name_generator(): - ... base_name = "my_column_" - ... count = 0 - ... while True: - ... yield f"{base_name}{count}" - ... count += 1 - ... - >>> df.transpose(include_header=False, column_names=name_generator()) - shape: (2, 3) - ┌─────────────┬─────────────┬─────────────┐ - │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════════════╪═════════════╪═════════════╡ - │ 1 ┆ 2 ┆ 3 │ - │ 1 ┆ 2 ┆ 3 │ - └─────────────┴─────────────┴─────────────┘ - - Use an existing column as the new column names - - >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) - >>> df.transpose(column_names="id") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 3 ┆ 2 │ - │ 3 ┆ 4 ┆ 6 │ - └─────┴─────┴─────┘ - >>> df.transpose(include_header=True, header_name="new_id", column_names="id") - shape: (2, 4) - ┌────────┬─────┬─────┬─────┐ - │ new_id ┆ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╪═════╡ - │ col1 ┆ 1 ┆ 3 ┆ 2 │ - │ col2 ┆ 3 ┆ 4 ┆ 6 │ - └────────┴─────┴─────┴─────┘ - ''' - def reverse(self) -> DataFrame: - ''' - Reverse the DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "key": ["a", "b", "c"], - ... "val": [1, 2, 3], - ... } - ... 
) - >>> df.reverse() - shape: (3, 2) - ┌─────┬─────┐ - │ key ┆ val │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ c ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 1 │ - └─────┴─────┘ - - ''' - def rename(self, mapping: dict[str, str]) -> DataFrame: - ''' - Rename column names. - - Parameters - ---------- - mapping - Key value pairs that map from old name to new name. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} - ... ) - >>> df.rename({"foo": "apple"}) - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - - ''' - def insert_column(self, index: int, column: Series) -> Self: - ''' - Insert a Series at a certain column index. - - This operation is in place. - - Parameters - ---------- - index - Index at which to insert the new `Series` column. - column - `Series` to insert. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> s = pl.Series("baz", [97, 98, 99]) - >>> df.insert_column(1, s) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ baz ┆ bar │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 97 ┆ 4 │ - │ 2 ┆ 98 ┆ 5 │ - │ 3 ┆ 99 ┆ 6 │ - └─────┴─────┴─────┘ - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) - >>> df.insert_column(3, s) - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ d │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ - │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ - │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ - └─────┴──────┴───────┴──────┘ - - ''' - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: - ''' - Filter the rows in the DataFrame based on a predicate expression. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - predicates - Expression that evaluates to a boolean Series. - constraints - Column filters. Use name=value to filter column name by the supplied value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - - Filter on one condition: - - >>> df.filter(pl.col("foo") > 1) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Filter on multiple conditions, combined with and/or operators: - - >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Provide multiple filters using `*args` syntax: - - >>> df.filter( - ... pl.col("foo") <= 2, - ... ~pl.col("ham").is_in(["b", "c"]), - ... 
) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `**kwargs` syntax: - - >>> df.filter(foo=2, ham="b") - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def glimpse(self) -> str | None: - ''' - Return a dense preview of the DataFrame. - - The formatting shows one line per column so that wide dataframes display - cleanly. Each line shows the column name, the data type, and the first - few values. - - Parameters - ---------- - max_items_per_column - Maximum number of items to show per column. - max_colname_length - Maximum length of the displayed column names; values that exceed this - value are truncated with a trailing ellipsis. - return_as_string - If True, return the preview as a string instead of printing to stdout. - - See Also - -------- - describe, head, tail - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, 2.8, 3.0], - ... "b": [4, 5, None], - ... "c": [True, False, True], - ... "d": [None, "b", "c"], - ... "e": ["usd", "eur", None], - ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], - ... } - ... ) - >>> df.glimpse() - Rows: 3 - Columns: 6 - $ a 1.0, 2.8, 3.0 - $ b 4, 5, None - $ c True, False, True - $ d None, \'b\', \'c\' - $ e \'usd\', \'eur\', None - $ f 2020-01-01, 2021-01-02, 2022-01-01 - - ''' - def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: - ''' - Summary statistics for a DataFrame. - - Parameters - ---------- - percentiles - One or more percentiles to include in the summary statistics. - All values must be in the range `[0, 1]`. - - Notes - ----- - The median is included by default as the 50% percentile. - - See Also - -------- - glimpse - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, 2.8, 3.0], - ... "b": [4, 5, None], - ... "c": [True, False, True], - ... "d": [None, "b", "c"], - ... "e": ["usd", "eur", None], - ... "f": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], - ... } - ... ) - >>> df.describe() - shape: (9, 7) - ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┬────────────┐ - │ describe ┆ a ┆ b ┆ c ┆ d ┆ e ┆ f │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ - ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╪════════════╡ - │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ - │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 ┆ 0 │ - │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │ - │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │ - │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │ - │ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ - │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ - │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ - │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │ - └────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘ - - ''' - def get_column_index(self, name: str) -> int: - ''' - Find the index of a column by name. - - Parameters - ---------- - name - Name of the column to find. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} - ... 
) - >>> df.get_column_index("ham") - 2 - - ''' - def replace_column(self, index: int, column: Series) -> Self: - ''' - Replace a column at an index location. - - This operation is in place. - - Parameters - ---------- - index - Column index. - column - Series that will replace the column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> s = pl.Series("apple", [10, 20, 30]) - >>> df.replace_column(0, s) - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 10 ┆ 6 ┆ a │ - │ 20 ┆ 7 ┆ b │ - │ 30 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - ''' - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: - ''' - Sort the dataframe by the given columns. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. - *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [6.0, 5.0, 4.0], - ... "c": ["a", "c", "b"], - ... } - ... ) - >>> df.sort("a") - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - Sorting by expressions is also supported. - - >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - └──────┴─────┴─────┘ - - Sort by multiple columns by passing a list of columns. - - >>> df.sort(["c", "a"], descending=True) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - └──────┴─────┴─────┘ - - Or use positional arguments to sort by multiple columns in the same way. - - >>> df.sort("c", "a", descending=[False, True]) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - ''' - def top_k(self, k: int) -> DataFrame: - ''' - Return the `k` largest elements. - - If \'descending=True` the smallest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - bottom_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 largest values in column b. 
- - >>> df.top_k(4, by="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ a ┆ 2 │ - │ b ┆ 2 │ - │ b ┆ 1 │ - └─────┴─────┘ - - Get the rows which contain the 4 largest values when sorting on column b and a. - - >>> df.top_k(4, by=["b", "a"]) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 2 │ - │ c ┆ 1 │ - └─────┴─────┘ - - ''' - def bottom_k(self, k: int) -> DataFrame: - ''' - Return the `k` smallest elements. - - If \'descending=True` the largest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - top_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 smallest values in column b. - - >>> df.bottom_k(4, by="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 1 │ - │ a ┆ 1 │ - │ c ┆ 1 │ - │ a ┆ 2 │ - └─────┴─────┘ - - Get the rows which contain the 4 smallest values when sorting on column a and b. - - >>> df.bottom_k(4, by=["a", "b"]) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ b ┆ 1 │ - │ b ┆ 2 │ - └─────┴─────┘ - - ''' - def equals(self, other: DataFrame) -> bool: - ''' - Check whether the DataFrame is equal to another DataFrame. - - Parameters - ---------- - other - DataFrame to compare with. - null_equal - Consider null values as equal. - - See Also - -------- - assert_frame_equal - - Examples - -------- - >>> df1 = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df2 = pl.DataFrame( - ... { - ... "foo": [3, 2, 1], - ... "bar": [8.0, 7.0, 6.0], - ... "ham": ["c", "b", "a"], - ... } - ... ) - >>> df1.equals(df1) - True - >>> df1.equals(df2) - False - - ''' - def replace(self, column: str, new_column: Series) -> Self: - ''' - Replace a column by a new Series. - - Parameters - ---------- - column - Column to replace. - new_column - New column to insert. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> s = pl.Series([10, 20, 30]) - >>> df.replace("foo", s) # works in-place! # doctest: +SKIP - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 10 ┆ 4 │ - │ 20 ┆ 5 │ - │ 30 ┆ 6 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int, length: int | None = ...) -> Self: - ''' - Get a slice of this DataFrame. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.slice(1, 2) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7.0 ┆ b │ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def head(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the last `abs(n)`. - - See Also - -------- - tail, glimpse, slice - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> df.head(3) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Pass a negative value to get all rows `except` the last `abs(n)`. - - >>> df.head(-3) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def tail(self, n: int = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the first `abs(n)`. - - See Also - -------- - head, slice - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 4, 5], - ... "bar": [6, 7, 8, 9, 10], - ... "ham": ["a", "b", "c", "d", "e"], - ... } - ... ) - >>> df.tail(3) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - │ 4 ┆ 9 ┆ d │ - │ 5 ┆ 10 ┆ e │ - └─────┴─────┴─────┘ - - Pass a negative value to get all rows `except` the first `abs(n)`. - - >>> df.tail(-3) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 4 ┆ 9 ┆ d │ - │ 5 ┆ 10 ┆ e │ - └─────┴─────┴─────┘ - - ''' - def limit(self, n: int = ...) -> Self: - """ - Get the first `n` rows. - - Alias for :func:`DataFrame.head`. - - Parameters - ---------- - n - Number of rows to return. If a negative value is passed, return all rows - except the last `abs(n)`. - - See Also - -------- - head - - """ - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: - ''' - Drop all rows that contain null values. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - subset - Column name(s) for which null values are considered. - If set to `None` (default), use all columns. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, None, 8], - ... "ham": ["a", "b", None], - ... } - ... ) - - The default behavior of this method is to drop rows where any single - value of the row is null. - - >>> df.drop_nulls() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - This behaviour can be constrained to consider only a subset of columns, as - defined by name or with a selector. 
For example, dropping rows if there is - a null in any of the integer columns: - - >>> import polars.selectors as cs - >>> df.drop_nulls(subset=cs.integer()) - shape: (2, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ null │ - └─────┴─────┴──────┘ - - Below are some additional examples that show how to drop null - values based on other conditions. - - >>> df = pl.DataFrame( - ... { - ... "a": [None, None, None, None], - ... "b": [1, 2, None, 1], - ... "c": [1, None, None, 1], - ... } - ... ) - >>> df - shape: (4, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪══════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ null ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴──────┴──────┘ - - Drop a row only if all values are null: - - >>> df.filter(~pl.all_horizontal(pl.all().is_null())) - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴─────┴──────┘ - - Drop a column if all values are null: - - >>> df[[s.name for s in df if not (s.null_count() == df.height)]] - shape: (4, 2) - ┌──────┬──────┐ - │ b ┆ c │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 1 ┆ 1 │ - │ 2 ┆ null │ - │ null ┆ null │ - │ 1 ┆ 1 │ - └──────┴──────┘ - - ''' - def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the frame as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Notes - ----- - It is recommended to use LazyFrame when piping operations, in order - to fully take advantage of query optimization and parallelization. - See :meth:`df.lazy() `. - - Examples - -------- - >>> def cast_str_to_int(data, col_name): - ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) - ... - >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) - >>> df.pipe(cast_str_to_int, col_name="b") - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 30 │ - │ 4 ┆ 40 │ - └─────┴─────┘ - - >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) - >>> df - shape: (2, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - └─────┴─────┘ - >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 1 │ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: - ''' - Add a column at index 0 that counts the rows. - - Parameters - ---------- - name - Name of the column to add. - offset - Start the row count at this offset. Default = 0 - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... 
) - >>> df.with_row_count() - shape: (3, 3) - ┌────────┬─────┬─────┐ - │ row_nr ┆ a ┆ b │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╡ - │ 0 ┆ 1 ┆ 2 │ - │ 1 ┆ 3 ┆ 4 │ - │ 2 ┆ 5 ┆ 6 │ - └────────┴─────┴─────┘ - - ''' - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: - ''' - Start a group by operation. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - .. note:: - Within each group, the order of rows is always preserved, regardless - of this argument. - - Returns - ------- - GroupBy - Object which can be used to perform aggregations. - - Examples - -------- - Group by one column and call `agg` to compute the grouped sum of another - column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... ) - >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 2 │ - │ b ┆ 5 │ - │ c ┆ 3 │ - └─────┴─────┘ - - Set `maintain_order=True` to ensure the order of the groups is consistent with - the input. - - >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) - shape: (3, 2) - ┌─────┬───────────┐ - │ a ┆ c │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [5, 3] │ - │ b ┆ [4, 2] │ - │ c ┆ [1] │ - └─────┴───────────┘ - - Group by multiple columns by passing a list of column names. - - >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - Or use positional arguments to group by multiple columns in the same way. - Expressions are also accepted. - - >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ f64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 0 ┆ 4.0 │ - │ b ┆ 1 ┆ 3.0 │ - │ c ┆ 1 ┆ 1.0 │ - └─────┴─────┴─────┘ - - The `GroupBy` object returned by this method is iterable, returning the name - and data of each group. - - >>> for name, data in df.group_by("a"): # doctest: +SKIP - ... print(name) - ... print(data) - ... - a - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘ - b - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘ - c - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def rolling(self, index_column: IntoExpr) -> RollingGroupBy: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - Different from a `group_by_dynamic` the windows are now determined by the - individual values and are not of constant intervals. 
For constant intervals use - :func:`DataFrame.group_by_dynamic`. - - If you have a time series ``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... - * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... - * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - **"1i" # length 1** - - **"10i" # length 10** - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling operation on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - RollingGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - group_by_dynamic - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> out = df.rolling(index_column="dt", period="2d").agg( - ... [ - ... pl.sum("a").alias("sum_a"), - ... pl.min("a").alias("min_a"), - ... pl.max("a").alias("max_a"), - ... ] - ... 
) - >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] - >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] - >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] - >>> out - shape: (6, 4) - ┌─────────────────────┬───────┬───────┬───────┐ - │ dt ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴───────┴───────┴───────┘ - - ''' - def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - Time windows are calculated and rows are assigned to windows. Different from a - normal group by is that a row can be member of multiple groups. - By default, the windows look like: - - - [start, start + period) - - [start + every, start + every + period) - - [start + 2*every, start + 2*every + period) - - ... - - where `start` is determined by `start_by`, `offset`, and `every` (see parameter - descriptions below). - - .. warning:: - The index column must be sorted in ascending order. If `by` is passed, then - the index column must be sorted in ascending order within each group. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - - .. deprecated:: 0.19.4 - Use `label` instead. - include_boundaries - Add the lower and upper bound of the window to the "_lower_boundary" and - "_upper_boundary" columns. This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - label : {\'left\', \'right\', \'datapoint\'} - Define which label to use for the window: - - - \'left\': lower boundary of the window - - \'right\': upper boundary of the window - - \'datapoint\': the first value of the index column in the given window. - If you don\'t need the label to be at one of the boundaries, choose this - option for maximum performance - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. 
- * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - DynamicGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - rolling - - Notes - ----- - 1) If you\'re coming from pandas, then - - .. code-block:: python - - # polars - df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) - - is equivalent to - - .. code-block:: python - - # pandas - df.set_index("ts").resample("D")["value"].sum().reset_index() - - though note that, unlike pandas, polars doesn\'t add extra rows for empty - windows. If you need `index_column` to be evenly spaced, then please combine - with :func:`DataFrame.upsample`. - - 2) The `every`, `period` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a group_by_dynamic on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Examples - -------- - >>> from datetime import datetime - >>> df = pl.DataFrame( - ... { - ... "time": pl.datetime_range( - ... start=datetime(2021, 12, 16), - ... end=datetime(2021, 12, 16, 3), - ... interval="30m", - ... eager=True, - ... ), - ... "n": range(7), - ... } - ... ) - >>> df - shape: (7, 2) - ┌─────────────────────┬─────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i64 │ - ╞═════════════════════╪═════╡ - │ 2021-12-16 00:00:00 ┆ 0 │ - │ 2021-12-16 00:30:00 ┆ 1 │ - │ 2021-12-16 01:00:00 ┆ 2 │ - │ 2021-12-16 01:30:00 ┆ 3 │ - │ 2021-12-16 02:00:00 ┆ 4 │ - │ 2021-12-16 02:30:00 ┆ 5 │ - │ 2021-12-16 03:00:00 ┆ 6 │ - └─────────────────────┴─────┘ - - Group by windows of 1 hour starting at 2021-12-16 00:00:00. - - >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [1, 2] │ - │ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ 2021-12-16 02:00:00 ┆ [5, 6] │ - └─────────────────────┴───────────┘ - - The window boundaries can also be added to the aggregation result - - >>> df.group_by_dynamic( - ... "time", every="1h", include_boundaries=True, closed="right" - ... 
).agg(pl.col("n").mean()) - shape: (4, 4) - ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ - │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ - ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ - │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ - │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ - │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ - │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ - └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ - - When closed="left", the window excludes the right end of interval: - [lower_bound, upper_bound) - - >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-16 00:00:00 ┆ [0, 1] │ - │ 2021-12-16 01:00:00 ┆ [2, 3] │ - │ 2021-12-16 02:00:00 ┆ [4, 5] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - When closed="both" the time values at the window boundaries belong to 2 groups. - - >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) - shape: (5, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ - │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - Dynamic group bys can also be combined with grouping on normal keys - - >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) - >>> df - shape: (7, 3) - ┌─────────────────────┬─────┬────────┐ - │ time ┆ n ┆ groups │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ str │ - ╞═════════════════════╪═════╪════════╡ - │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ - │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ - │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ - │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ - │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ - │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ - │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ - └─────────────────────┴─────┴────────┘ - >>> df.group_by_dynamic( - ... "time", - ... every="1h", - ... closed="both", - ... by="groups", - ... include_boundaries=True, - ... 
).agg(pl.col("n")) - shape: (7, 5) - ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ - │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ - ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ - │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ - │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ - │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ - │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ - │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ - └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ - - Dynamic group by on an index column - - >>> df = pl.DataFrame( - ... { - ... "idx": pl.int_range(0, 6, eager=True), - ... "A": ["A", "A", "B", "B", "B", "C"], - ... } - ... ) - >>> ( - ... df.group_by_dynamic( - ... "idx", - ... every="2i", - ... period="3i", - ... include_boundaries=True, - ... closed="right", - ... ).agg(pl.col("A").alias("A_agg_list")) - ... ) - shape: (4, 4) - ┌─────────────────┬─────────────────┬─────┬─────────────────┐ - │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 ┆ list[str] │ - ╞═════════════════╪═════════════════╪═════╪═════════════════╡ - │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ - │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ - │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ - │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ - └─────────────────┴─────────────────┴─────┴─────────────────┘ - - ''' - def upsample(self, time_column: str) -> Self: - ''' - Upsample a DataFrame at a regular frequency. - - The `every` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - - - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - Parameters - ---------- - time_column - time column will be used to determine a date_range. - Note that this column has to be sorted for the output to make sense. - every - interval will start \'every\' duration - offset - change the start of the date_range by this offset. - by - First group by these columns and then upsample for every group - maintain_order - Keep the ordering predictable. This is slower. - - Returns - ------- - DataFrame - Result will be sorted by `time_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - Examples - -------- - Upsample a DataFrame by a certain interval. - - >>> from datetime import datetime - >>> df = pl.DataFrame( - ... { - ... "time": [ - ... datetime(2021, 2, 1), - ... datetime(2021, 4, 1), - ... datetime(2021, 5, 1), - ... datetime(2021, 6, 1), - ... ], - ... "groups": ["A", "B", "A", "B"], - ... 
"values": [0, 1, 2, 3], - ... } - ... ).set_sorted("time") - >>> df.upsample( - ... time_column="time", every="1mo", by="groups", maintain_order=True - ... ).select(pl.all().forward_fill()) - shape: (7, 3) - ┌─────────────────────┬────────┬────────┐ - │ time ┆ groups ┆ values │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ str ┆ i64 │ - ╞═════════════════════╪════════╪════════╡ - │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ - │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ - │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ - │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ - │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ - └─────────────────────┴────────┴────────┘ - - ''' - def join_asof(self, other: DataFrame) -> DataFrame: - ''' - Perform an asof join. - - This is similar to a left-join except that we match on nearest key rather than - equal keys. - - Both DataFrames must be sorted by the asof_join key. - - For each row in the left DataFrame: - - - A "backward" search selects the last row in the right DataFrame whose - \'on\' key is less than or equal to the left\'s key. - - - A "forward" search selects the first row in the right DataFrame whose - \'on\' key is greater than or equal to the left\'s key. - - - A "nearest" search selects the last row in the right DataFrame whose value - is nearest to the left\'s key. String keys are not currently supported for a - nearest search. - - The default is "backward". - - Parameters - ---------- - other - Lazy DataFrame to join with. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - by - join on these columns before doing asof join - by_left - join on these columns before doing asof join - by_right - join on these columns before doing asof join - strategy : {\'backward\', \'forward\', \'nearest\'} - Join strategy. - suffix - Suffix to append to columns with a duplicate name. - tolerance - Numeric tolerance. By setting this the join will only be done if the near - keys are within this distance. If an asof join is done on columns of dtype - "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta - object or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. - - Examples - -------- - >>> from datetime import datetime - >>> gdp = pl.DataFrame( - ... { - ... "date": [ - ... datetime(2016, 1, 1), - ... datetime(2017, 1, 1), - ... datetime(2018, 1, 1), - ... datetime(2019, 1, 1), - ... ], # note record date: Jan 1st (sorted!) - ... "gdp": [4164, 4411, 4566, 4696], - ... } - ... ).set_sorted("date") - >>> population = pl.DataFrame( - ... { - ... "date": [ - ... datetime(2016, 5, 12), - ... datetime(2017, 5, 12), - ... 
datetime(2018, 5, 12), - ... datetime(2019, 5, 12), - ... ], # note record date: May 12th (sorted!) - ... "population": [82.19, 82.66, 83.12, 83.52], - ... } - ... ).set_sorted("date") - >>> population.join_asof(gdp, on="date", strategy="backward") - shape: (4, 3) - ┌─────────────────────┬────────────┬──────┐ - │ date ┆ population ┆ gdp │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ f64 ┆ i64 │ - ╞═════════════════════╪════════════╪══════╡ - │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ - │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ - │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ - │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ - └─────────────────────┴────────────┴──────┘ - - ''' - def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: - ''' - Join in SQL-like fashion. - - Parameters - ---------- - other - DataFrame to join with. - on - Name(s) of the join columns in both DataFrames. - how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} - Join strategy. - - .. note:: - A left join preserves the row order of the left DataFrame. - left_on - Name(s) of the left join column(s). - right_on - Name(s) of the right join column(s). - suffix - Suffix to append to columns with a duplicate name. - validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} - Checks if join is of specified type. - - * *many_to_many* - “m:m”: default, does not result in checks - * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets - * *one_to_many* - “1:m”: check if join keys are unique in left dataset - * *many_to_one* - “m:1”: check if join keys are unique in right dataset - - .. note:: - - - This is currently not supported the streaming engine. - - This is only supported when joined by single columns. - - Returns - ------- - DataFrame - - See Also - -------- - join_asof - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> other_df = pl.DataFrame( - ... { - ... "apple": ["x", "y", "z"], - ... "ham": ["a", "b", "d"], - ... } - ... 
) - >>> df.join(other_df, on="ham") - shape: (2, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - └─────┴─────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="outer") - shape: (4, 4) - ┌──────┬──────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞══════╪══════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ null ┆ null ┆ d ┆ z │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └──────┴──────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="left") - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └─────┴─────┴─────┴───────┘ - - >>> df.join(other_df, on="ham", how="semi") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 7.0 ┆ b │ - └─────┴─────┴─────┘ - - >>> df.join(other_df, on="ham", how="anti") - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - Notes - ----- - For joining on columns with categorical data, see `pl.StringCache()`. - - ''' - def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: - ''' - Apply a custom/user-defined function (UDF) over the rows of the DataFrame. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - The UDF will receive each row as a tuple of values: `udf(row)`. - - Implementing logic using a Python function is almost always *significantly* - slower and more memory intensive than implementing the same logic using - the native expression API because: - - - The native expression engine runs in Rust; UDFs run in Python. - - Use of Python UDFs forces the DataFrame to be materialized in memory. - - Polars-native expressions can be parallelised (UDFs typically cannot). - - Polars-native expressions can be logically optimised (UDFs cannot). - - Wherever possible you should strongly prefer the native expression API - to achieve the best performance. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output type of the operation. If none given, Polars tries to infer the type. - inference_size - Only used in the case when the custom function returns rows. - This uses the first `n` rows to determine the output schema. - - Notes - ----- - * The frame-level `apply` cannot track column names (as the UDF is a black-box - that may arbitrarily drop, rearrange, transform, or add new columns); if you - want to apply a UDF such that column names are preserved, you should use the - expression-level `apply` syntax instead. - - * If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. 
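As a rough illustration of the `@lru_cache` note above (a sketch only, assuming the row tuples are hashable and the function is pure), the memoised UDF is built once and then handed to `map_rows`:

>>> from functools import lru_cache
>>> @lru_cache(maxsize=None)
... def expensive(row):
...     # stand-in for costly per-row work; repeated rows hit the cache
...     return row[0] * 2 + row[1]
...
>>> df = pl.DataFrame({"foo": [1, 1, 2], "bar": [3, 3, 4]})
>>> df.map_rows(expensive)  # doctest: +SKIP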
- - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) - - Return a DataFrame by mapping each row to a tuple: - - >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) - shape: (3, 2) - ┌──────────┬──────────┐ - │ column_0 ┆ column_1 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════════╪══════════╡ - │ 2 ┆ -3 │ - │ 4 ┆ 15 │ - │ 6 ┆ 24 │ - └──────────┴──────────┘ - - However, it is much better to implement this with a native expression: - - >>> df.select( - ... pl.col("foo") * 2, - ... pl.col("bar") * 3, - ... ) # doctest: +IGNORE_RESULT - - Return a DataFrame with a single column by mapping each row to a scalar: - - >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP - shape: (3, 1) - ┌───────┐ - │ apply │ - │ --- │ - │ i64 │ - ╞═══════╡ - │ 1 │ - │ 9 │ - │ 14 │ - └───────┘ - - In this case it is better to use the following native expression: - - >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT - - ''' - def hstack(self, columns: list[Series] | DataFrame) -> Self: - ''' - Return a new DataFrame grown horizontally by stacking multiple Series to it. - - Parameters - ---------- - columns - Series to stack. - in_place - Modify in place. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> x = pl.Series("apple", [10, 20, 30]) - >>> df.hstack([x]) - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6 ┆ a ┆ 10 │ - │ 2 ┆ 7 ┆ b ┆ 20 │ - │ 3 ┆ 8 ┆ c ┆ 30 │ - └─────┴─────┴─────┴───────┘ - - ''' - def vstack(self, other: DataFrame) -> Self: - ''' - Grow this DataFrame vertically by stacking a DataFrame to it. - - Parameters - ---------- - other - DataFrame to stack. - in_place - Modify in place. - - See Also - -------- - extend - - Examples - -------- - >>> df1 = pl.DataFrame( - ... { - ... "foo": [1, 2], - ... "bar": [6, 7], - ... "ham": ["a", "b"], - ... } - ... ) - >>> df2 = pl.DataFrame( - ... { - ... "foo": [3, 4], - ... "bar": [8, 9], - ... "ham": ["c", "d"], - ... } - ... ) - >>> df1.vstack(df2) - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - │ 4 ┆ 9 ┆ d │ - └─────┴─────┴─────┘ - - ''' - def extend(self, other: DataFrame) -> Self: - ''' - Extend the memory backed by this `DataFrame` with the values from `other`. - - Different from `vstack` which adds the chunks from `other` to the chunks of - this `DataFrame`, `extend` appends the data from `other` to the underlying - memory locations and thus may cause a reallocation. - - If this does not cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `vstack` when you want to do a query after a single - append. For instance, during online operations where you add `n` rows and rerun - a query. - - Prefer `vstack` over `extend` when you want to append many times before - doing a query. For instance, when you read in multiple files and want to store - them in a single `DataFrame`. In the latter case, finish the sequence of - `vstack` operations with a `rechunk`. - - Parameters - ---------- - other - DataFrame to vertically add. - - Warnings - -------- - This method modifies the dataframe in-place. The dataframe is returned for - convenience only. 
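A minimal sketch of the trade-off described above (the three small frames are hypothetical stand-ins for files read from disk): append with `vstack` inside the loop and `rechunk` once at the end, and reserve `extend` for the single-append-then-query case:

>>> frames = [pl.DataFrame({"x": [i]}) for i in range(3)]  # stand-ins for input files
>>> out = frames[0]
>>> for frame in frames[1:]:
...     out = out.vstack(frame)
...
>>> out = out.rechunk()  # consolidate chunks once before querying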
- - See Also - -------- - vstack - - Examples - -------- - >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) - >>> df1.extend(df2) - shape: (6, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 5 │ - │ 3 ┆ 6 │ - │ 10 ┆ 40 │ - │ 20 ┆ 50 │ - │ 30 ┆ 60 │ - └─────┴─────┘ - - ''' - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: - ''' - Remove columns from the dataframe. - - Parameters - ---------- - columns - Names of the columns that should be removed from the dataframe, or - a selector that determines the columns to drop. - *more_columns - Additional columns to drop, specified as positional arguments. - - Examples - -------- - Drop a single column by passing the name of that column. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.drop("ham") - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 6.0 │ - │ 2 ┆ 7.0 │ - │ 3 ┆ 8.0 │ - └─────┴─────┘ - - Drop multiple columns by passing a list of column names. - - >>> df.drop(["bar", "ham"]) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Drop multiple columns by passing a selector. - - >>> import polars.selectors as cs - >>> df.drop(cs.numeric()) - shape: (3, 1) - ┌─────┐ - │ ham │ - │ --- │ - │ str │ - ╞═════╡ - │ a │ - │ b │ - │ c │ - └─────┘ - - Use positional arguments to drop multiple columns. - - >>> df.drop("foo", "ham") - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 6.0 │ - │ 7.0 │ - │ 8.0 │ - └─────┘ - - ''' - def drop_in_place(self, name: str) -> Series: - ''' - Drop a single column in-place and return the dropped column. - - Parameters - ---------- - name - Name of the column to drop. - - Returns - ------- - Series - The dropped column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.drop_in_place("ham") - shape: (3,) - Series: \'ham\' [str] - [ - "a" - "b" - "c" - ] - - ''' - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: - ''' - Cast DataFrame column(s) to the specified dtype(s). - - Parameters - ---------- - dtypes - Mapping of column names (or selector) to dtypes, or a single dtype - to which all columns will be cast. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], - ... } - ... 
) - - Cast specific frame columns to the specified dtypes: - - >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ u8 ┆ date │ - ╞═════╪═════╪════════════╡ - │ 1.0 ┆ 6 ┆ 2020-01-02 │ - │ 2.0 ┆ 7 ┆ 2021-03-04 │ - │ 3.0 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - Cast all frame columns to the specified dtype: - - >>> df.cast(pl.Utf8).to_dict(as_series=False) - {\'foo\': [\'1\', \'2\', \'3\'], - \'bar\': [\'6.0\', \'7.0\', \'8.0\'], - \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} - - Use selectors to define the columns being cast: - - >>> import polars.selectors as cs - >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}) - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ str │ - ╞═════╪═════╪════════════╡ - │ 1 ┆ 6 ┆ 2020-01-02 │ - │ 2 ┆ 7 ┆ 2021-03-04 │ - │ 3 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - ''' - def clear(self, n: int = ...) -> Self: - ''' - Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. - - Returns a `n`-row null-filled DataFrame with an identical schema. - `n` can be greater than the current number of rows in the DataFrame. - - Parameters - ---------- - n - Number of (null-filled) rows to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> df.clear() - shape: (0, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪═════╪══════╡ - └─────┴─────┴──────┘ - - >>> df.clear(n=2) - shape: (2, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪══════╪══════╡ - │ null ┆ null ┆ null │ - │ null ┆ null ┆ null │ - └──────┴──────┴──────┘ - - ''' - def clone(self) -> Self: - ''' - Create a copy of this DataFrame. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current DataFrame, with identical - schema but no data. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.clone() - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true │ - │ 2 ┆ 4.0 ┆ true │ - │ 3 ┆ 10.0 ┆ false │ - │ 4 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - ''' - def get_columns(self) -> list[Series]: - ''' - Get the DataFrame as a List of Series. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.get_columns() - [shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 3 - ], shape: (3,) - Series: \'bar\' [i64] - [ - 4 - 5 - 6 - ]] - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.get_columns() - [shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - ], shape: (4,) - Series: \'b\' [f64] - [ - 0.5 - 4.0 - 10.0 - 13.0 - ], shape: (4,) - Series: \'c\' [bool] - [ - true - true - false - true - ]] - - ''' - def get_column(self, name: str) -> Series: - ''' - Get a single column by name. - - Parameters - ---------- - name : str - Name of the column to retrieve. 
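As a point of comparison (a sketch; plain subscripting by column name is assumed here to return the same Series), `df[name]` and `get_column(name)` are interchangeable for simple lookups:

>>> df = pl.DataFrame({"foo": [1, 2, 3]})
>>> df["foo"].to_list() == df.get_column("foo").to_list()
True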
- - Returns - ------- - Series - - See Also - -------- - to_series - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.get_column("foo") - shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - matches_supertype - Fill all matching supertype of the fill `value`. - - Returns - ------- - DataFrame - DataFrame with None values replaced by the filling strategy. - - See Also - -------- - fill_nan - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 4], - ... "b": [0.5, 4, None, 13], - ... } - ... ) - >>> df.fill_null(99) - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 99 ┆ 99.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - >>> df.fill_null(strategy="forward") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> df.fill_null(strategy="max") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> df.fill_null(strategy="zero") - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 0 ┆ 0.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - ''' - def fill_nan(self, value: Expr | int | float | None) -> DataFrame: - ''' - Fill floating point NaN values by an Expression evaluation. - - Parameters - ---------- - value - Value with which to replace NaN values. - - Returns - ------- - DataFrame - DataFrame with NaN values replaced by the given value. - - Warnings - -------- - Note that floating point NaNs (Not a Number) are not missing values! - To replace missing values, use :func:`fill_null`. - - See Also - -------- - fill_null - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1.5, 2, float("nan"), 4], - ... "b": [0.5, 4, float("nan"), 13], - ... } - ... ) - >>> df.fill_nan(99) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════╡ - │ 1.5 ┆ 0.5 │ - │ 2.0 ┆ 4.0 │ - │ 99.0 ┆ 99.0 │ - │ 4.0 ┆ 13.0 │ - └──────┴──────┘ - - ''' - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: - ''' - Explode the dataframe to long format by exploding the given columns. - - Parameters - ---------- - columns - Column names, expressions, or a selector defining them. The underlying - columns being exploded must be of List or Utf8 datatype. - *more_columns - Additional names of columns to explode, specified as positional arguments. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "letters": ["a", "a", "b", "c"], - ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], - ... } - ... 
) - >>> df - shape: (4, 2) - ┌─────────┬───────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════════╪═══════════╡ - │ a ┆ [1] │ - │ a ┆ [2, 3] │ - │ b ┆ [4, 5] │ - │ c ┆ [6, 7, 8] │ - └─────────┴───────────┘ - >>> df.explode("numbers") - shape: (8, 2) - ┌─────────┬─────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════════╪═════════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ a ┆ 3 │ - │ b ┆ 4 │ - │ b ┆ 5 │ - │ c ┆ 6 │ - │ c ┆ 7 │ - │ c ┆ 8 │ - └─────────┴─────────┘ - - ''' - def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: - ''' - Create a spreadsheet-style pivot table as a DataFrame. - - Only available in eager mode. See "Examples" section below for how to do a - "lazy pivot" if you know the unique column values in advance. - - Parameters - ---------- - values - Column values to aggregate. Can be multiple columns if the *columns* - arguments contains multiple columns as well. - index - One or multiple keys to group by. - columns - Name of the column(s) whose values will be used as the header of the output - DataFrame. - aggregate_function - Choose from: - - - None: no aggregation takes place, will raise error if multiple values are in group. - - A predefined aggregate function string, one of - {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} - - An expression to do the aggregation. - - maintain_order - Sort the grouped keys so that the output order is predictable. - sort_columns - Sort the transposed columns by name. Default is by order of discovery. - separator - Used as separator/delimiter in generated column names. - - Returns - ------- - DataFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": ["one", "one", "two", "two", "one", "two"], - ... "bar": ["y", "y", "y", "x", "x", "x"], - ... "baz": [1, 2, 3, 4, 5, 6], - ... } - ... ) - >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ y ┆ x │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ one ┆ 3 ┆ 5 │ - │ two ┆ 3 ┆ 10 │ - └─────┴─────┴─────┘ - - Pivot using selectors to determine the index/values/columns: - - >>> import polars.selectors as cs - >>> df.pivot( - ... values=cs.numeric(), - ... index=cs.string(), - ... columns=cs.string(), - ... aggregate_function="sum", - ... sort_columns=True, - ... ).sort( - ... by=cs.string(), - ... ) - shape: (4, 6) - ┌─────┬─────┬──────┬──────┬──────┬──────┐ - │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪══════╪══════╪══════╪══════╡ - │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ - │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ - │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ - │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ - └─────┴─────┴──────┴──────┴──────┴──────┘ - - Run an expression as aggregation function - - >>> df = pl.DataFrame( - ... { - ... "col1": ["a", "a", "a", "b", "b", "b"], - ... "col2": ["x", "x", "x", "x", "y", "y"], - ... "col3": [6, 7, 3, 2, 5, 7], - ... } - ... ) - >>> df.pivot( - ... index="col1", - ... columns="col2", - ... values="col3", - ... aggregate_function=pl.element().tanh().mean(), - ... 
) - shape: (2, 3) - ┌──────┬──────────┬──────────┐ - │ col1 ┆ x ┆ y │ - │ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 │ - ╞══════╪══════════╪══════════╡ - │ a ┆ 0.998347 ┆ null │ - │ b ┆ 0.964028 ┆ 0.999954 │ - └──────┴──────────┴──────────┘ - - Note that `pivot` is only available in eager mode. If you know the unique - column values in advance, you can use :meth:`polars.LazyFrame.groupby` to - get the same result as above in lazy mode: - - >>> index = pl.col("col1") - >>> columns = pl.col("col2") - >>> values = pl.col("col3") - >>> unique_column_values = ["x", "y"] - >>> aggregate_function = lambda col: col.tanh().mean() - >>> ( - ... df.lazy() - ... .group_by(index) - ... .agg( - ... *[ - ... aggregate_function(values.filter(columns == value)).alias(value) - ... for value in unique_column_values - ... ] - ... ) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌──────┬──────────┬──────────┐ - │ col1 ┆ x ┆ y │ - │ --- ┆ --- ┆ --- │ - │ str ┆ f64 ┆ f64 │ - ╞══════╪══════════╪══════════╡ - │ a ┆ 0.998347 ┆ null │ - │ b ┆ 0.964028 ┆ 0.999954 │ - └──────┴──────────┴──────────┘ - - ''' - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: - ''' - Unpivot a DataFrame from wide to long format. - - Optionally leaves identifiers set. - - This function is useful to massage a DataFrame into a format where one or more - columns are identifier variables (id_vars) while all other columns, considered - measured variables (value_vars), are "unpivoted" to the row axis leaving just - two non-identifier columns, \'variable\' and \'value\'. - - Parameters - ---------- - id_vars - Column(s) or selector(s) to use as identifier variables. - value_vars - Column(s) or selector(s) to use as values variables; if `value_vars` - is empty all columns that are not in `id_vars` will be used. - variable_name - Name to give to the `variable` column. Defaults to "variable" - value_name - Name to give to the `value` column. Defaults to "value" - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> import polars.selectors as cs - >>> df.melt(id_vars="a", value_vars=cs.numeric()) - shape: (6, 3) - ┌─────┬──────────┬───────┐ - │ a ┆ variable ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 │ - ╞═════╪══════════╪═══════╡ - │ x ┆ b ┆ 1 │ - │ y ┆ b ┆ 3 │ - │ z ┆ b ┆ 5 │ - │ x ┆ c ┆ 2 │ - │ y ┆ c ┆ 4 │ - │ z ┆ c ┆ 6 │ - └─────┴──────────┴───────┘ - - ''' - def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: - ''' - Unstack a long table to a wide form without doing an aggregation. - - This can be much faster than a pivot, because it can skip the grouping phase. - - Warnings - -------- - This functionality is experimental and may be subject to changes - without it being considered a breaking change. - - Parameters - ---------- - step - Number of rows in the unstacked frame. - how : { \'vertical\', \'horizontal\' } - Direction of the unstack. - columns - Column name(s) or selector(s) to include in the operation. - If set to `None` (default), use all columns. - fill_values - Fill values that don\'t fit the new size with this value. 
- - Examples - -------- - >>> from string import ascii_uppercase - >>> df = pl.DataFrame( - ... { - ... "x": list(ascii_uppercase[0:8]), - ... "y": pl.int_range(1, 9, eager=True), - ... } - ... ).with_columns( - ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), - ... ) - >>> df - shape: (8, 3) - ┌─────┬─────┬──────────┐ - │ x ┆ y ┆ z │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ list[u8] │ - ╞═════╪═════╪══════════╡ - │ A ┆ 1 ┆ [1, 2] │ - │ B ┆ 2 ┆ [2, 3] │ - │ C ┆ 3 ┆ [3, 4] │ - │ D ┆ 4 ┆ [4, 5] │ - │ E ┆ 5 ┆ [5, 6] │ - │ F ┆ 6 ┆ [6, 7] │ - │ G ┆ 7 ┆ [7, 8] │ - │ H ┆ 8 ┆ [8, 9] │ - └─────┴─────┴──────────┘ - >>> df.unstack(step=4, how="vertical") - shape: (4, 6) - ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ - │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ - ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ - │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ - │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ - │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ - │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ - └─────┴─────┴─────┴─────┴──────────┴──────────┘ - >>> df.unstack(step=2, how="horizontal") - shape: (4, 6) - ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ - │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ - ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ - │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ - │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ - │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ - │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ - └─────┴─────┴─────┴─────┴──────────┴──────────┘ - >>> import polars.selectors as cs - >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) - shape: (5, 2) - ┌─────┬─────┐ - │ y_0 ┆ y_1 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - │ 4 ┆ 0 │ - │ 5 ┆ 0 │ - └─────┴─────┘ - - ''' - def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: - ''' - Group by the given columns and return the groups as separate dataframes. - - Parameters - ---------- - by - Column name(s) or selector(s) to group by. - *more_by - Additional names of columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default partition by operation. - include_key - Include the columns used to partition the DataFrame in the output. - as_dict - Return a dictionary instead of a list. The dictionary keys are the distinct - group values that identify that group. - - Examples - -------- - Pass a single column name to partition by that column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... 
) - >>> df.partition_by("a") # doctest: +IGNORE_RESULT - [shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘] - - Partition by multiple columns by either passing a list of column names, or by - specifying each column name as a positional argument. - - >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT - [shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘] - - Return the partitions as a dictionary by specifying `as_dict=True`. - - >>> import polars.selectors as cs - >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT - {\'a\': shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 1 ┆ 3 │ - └─────┴─────┴─────┘, - \'b\': shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - └─────┴─────┴─────┘, - \'c\': shape: (1, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘} - - ''' - def shift(self, n: int = ...) -> DataFrame: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... ) - >>> df.shift() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ null ┆ null │ - │ 1 ┆ 5 │ - │ 2 ┆ 6 │ - │ 3 ┆ 7 │ - └──────┴──────┘ - - Pass a negative value to shift in the opposite direction instead. - - >>> df.shift(-2) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ null ┆ null │ - │ null ┆ null │ - └──────┴──────┘ - - Specify `fill_value` to fill the resulting null values. 
- - >>> df.shift(-2, fill_value=100) - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ 100 ┆ 100 │ - │ 100 ┆ 100 │ - └─────┴─────┘ - - ''' - def is_duplicated(self) -> Series: - ''' - Get a mask of all duplicated rows in this DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) - >>> df.is_duplicated() - shape: (4,) - Series: \'\' [bool] - [ - true - false - false - true - ] - - This mask can be used to visualize the duplicated lines like this: - - >>> df.filter(df.is_duplicated()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ str │ - ╞═════╪═════╡ - │ 1 ┆ x │ - │ 1 ┆ x │ - └─────┴─────┘ - ''' - def is_unique(self) -> Series: - ''' - Get a mask of all unique rows in this DataFrame. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) - >>> df.is_unique() - shape: (4,) - Series: \'\' [bool] - [ - false - true - true - false - ] - - This mask can be used to visualize the unique lines like this: - - >>> df.filter(df.is_unique()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ str │ - ╞═════╪═════╡ - │ 2 ┆ y │ - │ 3 ┆ z │ - └─────┴─────┘ - ''' - def lazy(self) -> LazyFrame: - ''' - Start a lazy query from this point. This returns a `LazyFrame` object. - - Operations on a `LazyFrame` are not executed until this is requested by either - calling: - - * :meth:`.fetch() ` - (run on a small number of rows) - * :meth:`.collect() ` - (run on all data) - * :meth:`.describe_plan() ` - (print unoptimized query plan) - * :meth:`.describe_optimized_plan() ` - (print optimized query plan) - * :meth:`.show_graph() ` - (show (un)optimized query plan as graphviz graph) - - Lazy operations are advised because they allow for query optimization and more - parallelization. - - Returns - ------- - LazyFrame - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> df.lazy() # doctest: +ELLIPSIS - - - ''' - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - ''' - Select columns from this DataFrame. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Examples - -------- - Pass the name of a column to select that column. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.select("foo") - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Multiple columns can be selected by passing a list of column names. - - >>> df.select(["foo", "bar"]) - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - └─────┴─────┘ - - Multiple columns can also be selected using positional arguments instead of a - list. Expressions are also accepted. 
- - >>> df.select(pl.col("foo"), pl.col("bar") + 1) - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - └─────┴─────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) - shape: (3, 1) - ┌───────────┐ - │ threshold │ - │ --- │ - │ i32 │ - ╞═══════════╡ - │ 0 │ - │ 0 │ - │ 10 │ - └───────────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... df.select( - ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), - ... ) - ... - shape: (3, 1) - ┌───────────┐ - │ is_odd │ - │ --- │ - │ struct[2] │ - ╞═══════════╡ - │ {1,0} │ - │ {0,1} │ - │ {1,0} │ - └───────────┘ - - ''' - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - """ - Select columns from this LazyFrame. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - See Also - -------- - select - - """ - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - ''' - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - DataFrame - A new DataFrame with the columns added. - - Notes - ----- - Creating a new DataFrame using this method does not create a new copy of - existing data. - - Examples - -------- - Pass an expression to add it as a new column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ a^2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ - └─────┴──────┴───────┴──────┘ - - Added columns will replace existing columns with the same name. - - >>> df.with_columns(pl.col("a").cast(pl.Float64)) - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1.0 ┆ 0.5 ┆ true │ - │ 2.0 ┆ 4.0 ┆ true │ - │ 3.0 ┆ 10.0 ┆ false │ - │ 4.0 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - Multiple columns can be added by passing a list of expressions. - - >>> df.with_columns( - ... [ - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ] - ... 
) - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Multiple columns also can be added using positional arguments instead of a list. - - >>> df.with_columns( - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ) - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> df.with_columns( - ... ab=pl.col("a") * pl.col("b"), - ... not_c=pl.col("c").not_(), - ... ) - shape: (4, 5) - ┌─────┬──────┬───────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ ab ┆ not_c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ - └─────┴──────┴───────┴──────┴───────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... df.drop("c").with_columns( - ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), - ... ) - ... - shape: (4, 3) - ┌─────┬──────┬─────────────┐ - │ a ┆ b ┆ diffs │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ struct[2] │ - ╞═════╪══════╪═════════════╡ - │ 1 ┆ 0.5 ┆ {null,null} │ - │ 2 ┆ 4.0 ┆ {1,3.5} │ - │ 3 ┆ 10.0 ┆ {1,6.0} │ - │ 4 ┆ 13.0 ┆ {1,3.0} │ - └─────┴──────┴─────────────┘ - - ''' - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: - """ - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - See Also - -------- - with_columns - - """ - def n_chunks(self, strategy: str = ...) -> int | list[int]: - ''' - Get number of chunks used by the ChunkedArrays of this DataFrame. - - Parameters - ---------- - strategy : {\'first\', \'all\'} - Return the number of chunks of the \'first\' column, - or \'all\' columns in this DataFrame. - - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... 
) - >>> df.n_chunks() - 1 - >>> df.n_chunks(strategy="all") - [1, 1, 1] - - ''' - def max(self, axis: int | None = ...) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their maximum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`max_horizontal`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.max() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def max_horizontal(self) -> Series: - ''' - Get the maximum value horizontally across columns. - - Returns - ------- - Series - A Series named `"max"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.max_horizontal() - shape: (3,) - Series: \'max\' [f64] - [ - 4.0 - 5.0 - 6.0 - ] - ''' - def min(self, axis: int | None = ...) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their minimum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`min_horizontal`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.min() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - ''' - def min_horizontal(self) -> Series: - ''' - Get the minimum value horizontally across columns. - - Returns - ------- - Series - A Series named `"min"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.min_horizontal() - shape: (3,) - Series: \'min\' [f64] - [ - 1.0 - 2.0 - 3.0 - ] - ''' - def sum(self) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their sum value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`sum_horizontal`. - null_strategy : {\'ignore\', \'propagate\'} - This argument is only used if `axis == 1`. - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.sum() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 6 ┆ 21 ┆ null │ - └─────┴─────┴──────┘ - ''' - def sum_horizontal(self) -> Series: - ''' - Sum all values horizontally across columns. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - If set to `False`, any null value in the input will lead to a null output. 
- - Returns - ------- - Series - A Series named `"sum"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.sum_horizontal() - shape: (3,) - Series: \'sum\' [f64] - [ - 5.0 - 7.0 - 9.0 - ] - ''' - def mean(self) -> Self | Series: - ''' - Aggregate the columns of this DataFrame to their mean value. - - Parameters - ---------- - axis - Either 0 (vertical) or 1 (horizontal). - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. This method will only - support vertical aggregation, as if `axis` were set to `0`. - To perform horizontal aggregation, use :meth:`mean_horizontal`. - null_strategy : {\'ignore\', \'propagate\'} - This argument is only used if `axis == 1`. - - .. deprecated:: 0.19.14 - This argument will be removed in a future version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... "spam": [True, False, None], - ... } - ... ) - >>> df.mean() - shape: (1, 4) - ┌─────┬─────┬──────┬──────┐ - │ foo ┆ bar ┆ ham ┆ spam │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str ┆ f64 │ - ╞═════╪═════╪══════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ - └─────┴─────┴──────┴──────┘ - ''' - def mean_horizontal(self) -> Series: - ''' - Take the mean of all values horizontally across columns. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - If set to `False`, any null value in the input will lead to a null output. - - Returns - ------- - Series - A Series named `"mean"`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [4.0, 5.0, 6.0], - ... } - ... ) - >>> df.mean_horizontal() - shape: (3,) - Series: \'mean\' [f64] - [ - 2.5 - 3.5 - 4.5 - ] - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their standard deviation value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.std() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1.0 ┆ 1.0 ┆ null │ - └─────┴─────┴──────┘ - >>> df.std(ddof=0) - shape: (1, 3) - ┌──────────┬──────────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞══════════╪══════════╪══════╡ - │ 0.816497 ┆ 0.816497 ┆ null │ - └──────────┴──────────┴──────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their variance value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.var() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1.0 ┆ 1.0 ┆ null │ - └─────┴─────┴──────┘ - >>> df.var(ddof=0) - shape: (1, 3) - ┌──────────┬──────────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞══════════╪══════════╪══════╡ - │ 0.666667 ┆ 0.666667 ┆ null │ - └──────────┴──────────┴──────┘ - - ''' - def median(self) -> Self: - ''' - Aggregate the columns of this DataFrame to their median value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.median() - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null │ - └─────┴─────┴──────┘ - - ''' - def product(self) -> DataFrame: - ''' - Aggregate the columns of this DataFrame to their product values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [0.5, 4, 10], - ... "c": [True, True, False], - ... } - ... ) - - >>> df.product() - shape: (1, 3) - ┌─────┬──────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ i64 │ - ╞═════╪══════╪═════╡ - │ 6 ┆ 20.0 ┆ 0 │ - └─────┴──────┴─────┘ - - ''' - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Aggregate the columns of this DataFrame to their quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.quantile(0.5, "nearest") - shape: (1, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 2.0 ┆ 7.0 ┆ null │ - └─────┴─────┴──────┘ - - ''' - def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Convert categorical variables into dummy/indicator variables. - - Parameters - ---------- - columns - Column name(s) or selector(s) that should be converted to dummy - variables. If set to `None` (default), convert all columns. - separator - Separator/delimiter used when generating column names. - drop_first - Remove the first category from the variables being encoded. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2], - ... "bar": [3, 4], - ... "ham": ["a", "b"], - ... } - ... 
) - >>> df.to_dummies() - shape: (2, 6) - ┌───────┬───────┬───────┬───────┬───────┬───────┐ - │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ - ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ - │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ - │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ - └───────┴───────┴───────┴───────┴───────┴───────┘ - - >>> df.to_dummies(drop_first=True) - shape: (2, 3) - ┌───────┬───────┬───────┐ - │ foo_2 ┆ bar_4 ┆ ham_b │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 │ - ╞═══════╪═══════╪═══════╡ - │ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1 ┆ 1 │ - └───────┴───────┴───────┘ - - >>> import polars.selectors as cs - >>> df.to_dummies(cs.integer(), separator=":") - shape: (2, 5) - ┌───────┬───────┬───────┬───────┬─────┐ - │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ - ╞═══════╪═══════╪═══════╪═══════╪═════╡ - │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ - │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ - └───────┴───────┴───────┴───────┴─────┘ - - >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") - shape: (2, 3) - ┌───────┬───────┬─────┐ - │ foo:2 ┆ bar:4 ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ str │ - ╞═══════╪═══════╪═════╡ - │ 0 ┆ 0 ┆ a │ - │ 1 ┆ 1 ┆ b │ - └───────┴───────┴─────┘ - - ''' - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: - ''' - Drop duplicate rows from this dataframe. - - Parameters - ---------- - subset - Column name(s) or selector(s), to consider when identifying - duplicate rows. If set to `None` (default), use all columns. - keep : {\'first\', \'last\', \'any\', \'none\'} - Which of the duplicate rows to keep. - - * \'any\': Does not give any guarantee of which row is kept. - This allows more optimizations. - * \'none\': Don\'t keep duplicate rows. - * \'first\': Keep first unique row. - * \'last\': Keep last unique row. - maintain_order - Keep the same order as the original DataFrame. This is more expensive to - compute. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - Returns - ------- - DataFrame - DataFrame with unique rows. - - Warnings - -------- - This method will fail if there is a column of type `List` in the DataFrame or - subset. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3, 1], - ... "bar": ["a", "a", "a", "a"], - ... "ham": ["b", "b", "b", "b"], - ... } - ... ) - >>> df.unique(maintain_order=True) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> df.unique(subset=["bar", "ham"], maintain_order=True) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> df.unique(keep="last", maintain_order=True) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - - ''' - def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: - ''' - Return the number of unique rows, or the number of unique row-subsets. - - Parameters - ---------- - subset - One or more columns/expressions that define what to count; - omit to return the count of unique rows. 
- - Notes - ----- - This method operates at the `DataFrame` level; to operate on subsets at the - expression level you can make use of struct-packing instead, for example: - - >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() - - If instead you want to count the number of unique values per-column, you can - also use expression-level syntax to return a new frame containing that result: - - >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) - >>> df_nunique = df.select(pl.all().n_unique()) - - In aggregate context there is also an equivalent method for returning the - unique values per-group: - - >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 1, 2, 3, 4, 5], - ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], - ... "c": [True, True, True, False, True, True], - ... } - ... ) - >>> df.n_unique() - 5 - - Simple columns subset. - - >>> df.n_unique(subset=["b", "c"]) - 4 - - Expression subset. - - >>> df.n_unique( - ... subset=[ - ... (pl.col("a") // 2), - ... (pl.col("c") | (pl.col("b") >= 2)), - ... ], - ... ) - 3 - - ''' - def approx_n_unique(self) -> DataFrame: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> df.approx_n_unique() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_unique(self) -> DataFrame: - """ - Approximate count of unique values. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`DataFrame.approx_n_unique`. - - """ - def rechunk(self) -> Self: - """ - Rechunk the data in this DataFrame to a contiguous allocation. - - This will make sure all subsequent operations have optimal and predictable - performance. - """ - def null_count(self) -> Self: - ''' - Create a new DataFrame that shows the null counts per column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 3], - ... "bar": [6, 7, None], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df.null_count() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 1 ┆ 0 │ - └─────┴─────┴─────┘ - - ''' - def sample(self, n: int | Series | None = ...) -> Self: - ''' - Sample from this DataFrame. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - If set to True, the order of the sampled rows will be shuffled. If - set to False (default), the order of the returned rows will be - neither stable nor fully random. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8 ┆ c │ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - - ''' - def fold(self, operation: Callable[[Series, Series], Series]) -> Series: - ''' - Apply a horizontal reduction on a DataFrame. - - This can be used to effectively determine aggregations on a row level, and can - be applied to any DataType that can be supercasted (casted to a similar parent - type). - - An example of the supercast rules when applying an arithmetic operation on two - DataTypes are for instance: - - - Int8 + Utf8 = Utf8 - - Float32 + Int64 = Float32 - - Float32 + Float64 = Float64 - - Examples - -------- - A horizontal sum operation: - - >>> df = pl.DataFrame( - ... { - ... "a": [2, 1, 3], - ... "b": [1, 2, 3], - ... "c": [1.0, 2.0, 3.0], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 + s2) - shape: (3,) - Series: \'a\' [f64] - [ - 4.0 - 5.0 - 9.0 - ] - - A horizontal minimum operation: - - >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) - >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 1.0 - 3.0 - ] - - A horizontal string concatenation: - - >>> df = pl.DataFrame( - ... { - ... "a": ["foo", "bar", 2], - ... "b": [1, 2, 3], - ... "c": [1.0, 2.0, 3.0], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 + s2) - shape: (3,) - Series: \'a\' [str] - [ - "foo11.0" - "bar22.0" - null - ] - - A horizontal boolean or, similar to a row-wise .any(): - - >>> df = pl.DataFrame( - ... { - ... "a": [False, False, True], - ... "b": [False, True, False], - ... } - ... ) - >>> df.fold(lambda s1, s2: s1 | s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - Parameters - ---------- - operation - function that takes two `Series` and returns a `Series`. - - ''' - def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: - ''' - Get the values of a single row, either by index or by predicate. - - Parameters - ---------- - index - Row index. - by_predicate - Select the row according to a given expression/predicate. - named - Return a dictionary instead of a tuple. The dictionary is a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - - Returns - ------- - tuple (default) or dictionary of row values - - Notes - ----- - The `index` and `by_predicate` params are mutually exclusive. Additionally, - to ensure clarity, the `by_predicate` parameter must be supplied by keyword. - - When using `by_predicate` it is an error condition if anything other than - one row is returned; more than one row raises `TooManyRowsReturnedError`, and - zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). - - Warnings - -------- - You should NEVER use this method to iterate over a DataFrame; if you require - row-iteration you should strongly prefer use of `iter_rows()` instead. - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - rows : Materialise all frame data as a list of rows (potentially expensive). - item: Return dataframe element as a scalar. - - Examples - -------- - Specify an index to return the row at the given index as a tuple. - - >>> df = pl.DataFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - >>> df.row(2) - (3, 8, \'c\') - - Specify `named=True` to get a dictionary instead with a mapping of column - names to row values. - - >>> df.row(2, named=True) - {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} - - Use `by_predicate` to return the row that matches the given predicate. - - >>> df.row(by_predicate=(pl.col("ham") == "b")) - (2, 7, \'b\') - - ''' - def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: - ''' - Returns all data in the DataFrame as a list of rows of python-native values. - - Parameters - ---------- - named - Return dictionaries instead of tuples. The dictionaries are a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Warnings - -------- - Row-iteration is not optimal as the underlying data is stored in columnar form; - where possible, prefer export via one of the dedicated export/output methods. - Where possible you should also consider using `iter_rows` instead to avoid - materialising all the data at once. - - Returns - ------- - list of tuples (default) or dictionaries of row values - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - rows_by_key : Materialises frame data as a key-indexed dictionary. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "x": ["a", "b", "b", "a"], - ... "y": [1, 2, 3, 4], - ... "z": [0, 3, 6, 9], - ... } - ... ) - >>> df.rows() - [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] - >>> df.rows(named=True) - [{\'x\': \'a\', \'y\': 1, \'z\': 0}, - {\'x\': \'b\', \'y\': 2, \'z\': 3}, - {\'x\': \'b\', \'y\': 3, \'z\': 6}, - {\'x\': \'a\', \'y\': 4, \'z\': 9}] - - ''' - def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: - ''' - Returns DataFrame data as a keyed dictionary of python-native values. - - Note that this method should not be used in place of native operations, due to - the high cost of materialising all frame data out into a dictionary; it should - be used only when you need to move the values out into a Python data structure - or other object that cannot operate directly with Polars/Arrow. - - Parameters - ---------- - key - The column(s) to use as the key for the returned dictionary. If multiple - columns are specified, the key will be a tuple of those values, otherwise - it will be a string. - named - Return dictionary rows instead of tuples, mapping column name to row value. - include_key - Include key values inline with the associated data (by default the key - values are omitted as a memory/performance optimisation, as they can be - reoconstructed from the key). - unique - Indicate that the key is unique; this will result in a 1:1 mapping from - key to a single associated row. Note that if the key is *not* actually - unique the last row with the given key will be returned. - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. 
If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - See Also - -------- - rows : Materialise all frame data as a list of rows (potentially expensive). - iter_rows : Row iterator over frame data (does not materialise all rows). - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "w": ["a", "b", "b", "a"], - ... "x": ["q", "q", "q", "k"], - ... "y": [1.0, 2.5, 3.0, 4.5], - ... "z": [9, 8, 7, 6], - ... } - ... ) - - Group rows by the given key column(s): - - >>> df.rows_by_key(key=["w"]) - defaultdict(, - {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], - \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) - - Return the same row groupings as dictionaries: - - >>> df.rows_by_key(key=["w"], named=True) - defaultdict(, - {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, - {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], - \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, - {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) - - Return row groupings, assuming keys are unique: - - >>> df.rows_by_key(key=["z"], unique=True) - {9: (\'a\', \'q\', 1.0), - 8: (\'b\', \'q\', 2.5), - 7: (\'b\', \'q\', 3.0), - 6: (\'a\', \'k\', 4.5)} - - Return row groupings as dictionaries, assuming keys are unique: - - >>> df.rows_by_key(key=["z"], named=True, unique=True) - {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, - 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, - 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, - 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} - - Return dictionary rows grouped by a compound key, including key values: - - >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) - defaultdict(, - {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], - (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, - {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], - (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) - - ''' - def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: - ''' - Returns an iterator over the DataFrame of rows of python-native values. - - Parameters - ---------- - named - Return dictionaries instead of tuples. The dictionaries are a mapping of - column name to row value. This is more expensive than returning a regular - tuple, but allows for accessing values by column name. - buffer_size - Determines the number of rows that are buffered internally while iterating - over the data; you should only modify this in very specific cases where the - default value is determined not to be a good fit to your access pattern, as - the speedup from using the buffer is significant (~2-4x). Setting this - value to zero disables row buffering (not recommended). - - Notes - ----- - If you have `ns`-precision temporal values you should be aware that Python - natively only supports up to `μs`-precision; `ns`-precision values will be - truncated to microseconds on conversion to Python. If this matters to your - use-case you should export to a different format (such as Arrow or NumPy). - - Warnings - -------- - Row iteration is not optimal as the underlying data is stored in columnar form; - where possible, prefer export via one of the dedicated export/output methods - that deals with columnar data. - - Returns - ------- - iterator of tuples (default) or dictionaries (if named) of python row values - - See Also - -------- - rows : Materialises all frame data as a list of rows (potentially expensive). - rows_by_key : Materialises frame data as a key-indexed dictionary. 
- - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> [row[0] for row in df.iter_rows()] - [1, 3, 5] - >>> [row["b"] for row in df.iter_rows(named=True)] - [2, 4, 6] - - ''' - def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: - ''' - Returns a non-copying iterator of slices over the underlying DataFrame. - - Parameters - ---------- - n_rows - Determines the number of rows contained in each DataFrame slice. - - Examples - -------- - >>> from datetime import date - >>> df = pl.DataFrame( - ... data={ - ... "a": range(17_500), - ... "b": date(2023, 1, 1), - ... "c": "klmnoopqrstuvwxyz", - ... }, - ... schema_overrides={"a": pl.Int32}, - ... ) - >>> for idx, frame in enumerate(df.iter_slices()): - ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") - ... - DataFrame:[0]:10000 - DataFrame:[1]:7500 - - Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and - any supported frame export/conversion types; for example, as RecordBatches: - - >>> for frame in df.iter_slices(n_rows=15_000): - ... record_batch = frame.to_arrow().to_batches()[0] - ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") - ... - a: int32 - b: date32[day] - c: large_string - << 15000 - a: int32 - b: date32[day] - c: large_string - << 2500 - - See Also - -------- - iter_rows : Row iterator over frame data (does not materialise all rows). - partition_by : Split into multiple DataFrames, partitioned by groups. - - ''' - def shrink_to_fit(self) -> Self: - """ - Shrink DataFrame memory usage. - - Shrinks to fit the exact capacity needed to hold the data. - - """ - def gather_every(self, n: int) -> DataFrame: - ''' - Take every nth row in the DataFrame and return as a new DataFrame. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - >>> s.gather_every(2) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 5 │ - │ 3 ┆ 7 │ - └─────┴─────┘ - - ''' - def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: - ''' - Hash and combine the rows in this DataFrame. - - The hash value is of type `UInt64`. - - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. - - Notes - ----- - This implementation of :func:`hash_rows` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 3, 4], - ... "ham": ["a", "b", None, "d"], - ... } - ... ) - >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT - shape: (4,) - Series: \'\' [u64] - [ - 10783150408545073287 - 1438741209321515184 - 10047419486152048166 - 2047317070637311557 - ] - - ''' - def interpolate(self) -> DataFrame: - ''' - Interpolate intermediate values. The interpolation method is linear. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "foo": [1, None, 9, 10], - ... "bar": [6, 7, 9, None], - ... "baz": [1, None, None, 9], - ... } - ... 
) - >>> df.interpolate() - shape: (4, 3) - ┌──────┬──────┬──────────┐ - │ foo ┆ bar ┆ baz │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════════╡ - │ 1.0 ┆ 6.0 ┆ 1.0 │ - │ 5.0 ┆ 7.0 ┆ 3.666667 │ - │ 9.0 ┆ 9.0 ┆ 6.333333 │ - │ 10.0 ┆ null ┆ 9.0 │ - └──────┴──────┴──────────┘ - - ''' - def is_empty(self) -> bool: - ''' - Check if the dataframe is empty. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df.is_empty() - False - >>> df.filter(pl.col("foo") > 99).is_empty() - True - - ''' - def to_struct(self, name: str) -> Series: - ''' - Convert a `DataFrame` to a `Series` of type `Struct`. - - Parameters - ---------- - name - Name for the struct Series - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 4, 5], - ... "b": ["one", "two", "three", "four", "five"], - ... } - ... ) - >>> df.to_struct("nums") - shape: (5,) - Series: \'nums\' [struct[2]] - [ - {1,"one"} - {2,"two"} - {3,"three"} - {4,"four"} - {5,"five"} - ] - - ''' - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Decompose struct columns into separate columns for each of their fields. - - The new columns will be inserted into the dataframe at the location of the - struct column. - - Parameters - ---------- - columns - Name of the struct column(s) that should be unnested. - *more_columns - Additional columns to unnest, specified as positional arguments. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "before": ["foo", "bar"], - ... "t_a": [1, 2], - ... "t_b": ["a", "b"], - ... "t_c": [True, None], - ... "t_d": [[1, 2], [3]], - ... "after": ["baz", "womp"], - ... } - ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") - >>> df - shape: (2, 3) - ┌────────┬─────────────────────┬───────┐ - │ before ┆ t_struct ┆ after │ - │ --- ┆ --- ┆ --- │ - │ str ┆ struct[4] ┆ str │ - ╞════════╪═════════════════════╪═══════╡ - │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ - │ bar ┆ {2,"b",null,[3]} ┆ womp │ - └────────┴─────────────────────┴───────┘ - >>> df.unnest("t_struct") - shape: (2, 6) - ┌────────┬─────┬─────┬──────┬───────────┬───────┐ - │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ - ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ - │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ - │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ - └────────┴─────┴─────┴──────┴───────────┴───────┘ - - ''' - def corr(self, **kwargs: Any) -> DataFrame: - ''' - Return pairwise Pearson product-moment correlation coefficients between columns. - - See numpy `corrcoef` for more information: - https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html - - Notes - ----- - This functionality requires numpy to be installed. - - Parameters - ---------- - **kwargs - Keyword arguments are passed to numpy `corrcoef`. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) - >>> df.corr() - shape: (3, 3) - ┌──────┬──────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════╡ - │ 1.0 ┆ -1.0 ┆ 1.0 │ - │ -1.0 ┆ 1.0 ┆ -1.0 │ - │ 1.0 ┆ -1.0 ┆ 1.0 │ - └──────┴──────┴──────┘ - - ''' - def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: - ''' - Take two sorted DataFrames and merge them by the sorted key. - - The output of this operation will also be sorted. 
- It is the callers responsibility that the frames are sorted - by that key otherwise the output will not make sense. - - The schemas of both DataFrames must be equal. - - Parameters - ---------- - other - Other DataFrame that must be merged - key - Key that is sorted. - - Examples - -------- - >>> df0 = pl.DataFrame( - ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} - ... ).sort("age") - >>> df0 - shape: (3, 2) - ┌───────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═════╡ - │ bob ┆ 18 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └───────┴─────┘ - >>> df1 = pl.DataFrame( - ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} - ... ).sort("age") - >>> df1 - shape: (4, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - └────────┴─────┘ - >>> df0.merge_sorted(df1, key="age") - shape: (7, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ bob ┆ 18 │ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └────────┴─────┘ - ''' - def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: - """ - Indicate that one or multiple columns are sorted. - - Parameters - ---------- - column - Columns that are sorted - more_columns - Additional columns that are sorted, specified as positional arguments. - descending - Whether the columns are sorted in descending order. - """ - def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> DataFrame: - ''' - Update the values in this `DataFrame` with the values in `other`. - - By default, null values in the right dataframe are ignored. Use - `ignore_nulls=False` to overwrite values in this frame with null values in other - frame. - - Notes - ----- - This is syntactic sugar for a left/inner join, with an optional coalesce when - `include_nulls = False`. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Parameters - ---------- - other - DataFrame that will be used to update the values - on - Column names that will be joined on. - If none given the row count is used. - left_on - Join column(s) of the left DataFrame. - right_on - Join column(s) of the right DataFrame. - how : {\'left\', \'inner\', \'outer\'} - * \'left\' will keep all rows from the left table; rows may be duplicated - if multiple rows in the right frame match the left row\'s key. - * \'inner\' keeps only those rows where the key exists in both frames. - * \'outer\' will update existing rows where the key matches while also - adding any new rows contained in the given frame. - include_nulls - If True, null values from the right dataframe will be used to update the - left dataframe. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4], - ... "B": [400, 500, 600, 700], - ... } - ... ) - >>> df - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 400 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - >>> new_df = pl.DataFrame( - ... { - ... "B": [-66, None, -99], - ... "C": [5, 3, 1], - ... } - ... 
) - - Update `df` values with the non-null values in `new_df`, by row index: - - >>> df.update(new_df) - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, by row index, - but only keeping those rows that are common to both frames: - - >>> df.update(new_df, how="inner") - shape: (3, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") - shape: (5, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴─────┘ - - Update `df` values including null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> df.update( - ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ null │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴──────┘ - - ''' - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: - """ - Start a group by operation. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.group_by`. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - .. note:: - Within each group, the order of rows is always preserved, regardless - of this argument. - - Returns - ------- - GroupBy - Object which can be used to perform aggregations. - - """ - def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. 
- Doing so incorrectly will lead to incorrect output - - """ - def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.9 - This method has been renamed to :func:`DataFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - """ - def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.group_by_dynamic`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - include_boundaries - Add the lower and upper bound of the window to the "_lower_bound" and - "_upper_bound" columns. This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. 
- check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - DynamicGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - ''' - def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: - """ - Apply a custom/user-defined function (UDF) over the rows of the DataFrame. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`DataFrame.map_rows`. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output type of the operation. If none given, Polars tries to infer the type. - inference_size - Only used in the case when the custom function returns rows. - This uses the first `n` rows to determine the output schema - - """ - def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - fill None values with this value. - n - Number of places to shift (may be negative). - - """ - def take_every(self, n: int) -> DataFrame: - """ - Take every nth row in the DataFrame and return as a new DataFrame. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def find_idx_by_name(self, name: str) -> int: - """ - Find the index of a column by name. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`get_column_index`. - - Parameters - ---------- - name - Name of the column to find. - """ - def insert_at_idx(self, index: int, column: Series) -> Self: - """ - Insert a Series at a certain column index. This operation is in place. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`insert_column`. - - Parameters - ---------- - index - Column to insert the new `Series` column. - column - `Series` to insert. - """ - def replace_at_idx(self, index: int, new_column: Series) -> Self: - """ - Replace a column at an index location. - - .. deprecated:: 0.19.14 - This method has been renamed to :func:`replace_column`. - - Parameters - ---------- - index - Column index. - new_column - Series that will replace the column. - """ - def frame_equal(self, other: DataFrame) -> bool: - """ - Check whether the DataFrame is equal to another DataFrame. - - .. deprecated:: 0.19.16 - This method has been renamed to :func:`equals`. - - Parameters - ---------- - other - DataFrame to compare with. - null_equal - Consider null values as equal. - """ - @property - def shape(self): ... - @property - def height(self): ... - @property - def width(self): ... - @property - def dtypes(self): ... - @property - def flags(self): ... - @property - def schema(self): ... -def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... 
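For quick reference while reviewing the removed 0.19-era stub above, here is a minimal usage sketch of the renamed APIs that its deprecation notices point to (group_by, gather_every, get_column_index, equals, and shift with fill_value). The frame literal mirrors the gather_every example in the stub; everything else is illustrative only and assumes a polars release where these renames are available.

import polars as pl

# Frame shaped like the `gather_every` example in the stub above.
df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
other = df.clone()

df.group_by("a").agg(pl.col("b").sum())  # replaces the deprecated df.groupby(...)
df.gather_every(2)                       # replaces df.take_every(2)
df.get_column_index("b")                 # replaces df.find_idx_by_name("b")
df.equals(other)                         # replaces df.frame_equal(other)
df.shift(1, fill_value=0)                # replaces df.shift_and_fill(0, n=1)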
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/dataframe/frame.pyi new file mode 100644 index 0000000..8b4310f --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/dataframe/frame.pyi @@ -0,0 +1,7123 @@ +#: version 0.20.3 +import P +import deltalake +import deltalake.table +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Enum as Enum, Float64 as Float64, Null as Null, Object as Object, String as String, Unknown as Unknown +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, hvplot as hvplot +from polars.exceptions import ModuleUpgradeRequired as ModuleUpgradeRequired, NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, frame_to_pydf as frame_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, _warn_null_comparison as _warn_null_comparison, handle_projection_columns as 
handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, IO, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_HVPLOT_AVAILABLE: bool +_PANDAS_AVAILABLE: bool +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. 
+ infer_schema_length + How many rows to scan to determine the column type. + + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. 
The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + + """ + @classmethod + def _read_csv(cls, source: str | Path | IO[bytes] | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use `pl.read_csv` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + + """ + @classmethod + def _read_parquet(cls, source: str | Path | IO[bytes] | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use `pl.read_parquet` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading `n_rows`. + + """ + @classmethod + def _read_ipc(cls, source: str | Path | IO[bytes] | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | IO[bytes] | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading `n_rows`. + row_count_name + Row count name. + row_count_offset + Row count offset. + rechunk + Make sure that all data is contiguous. + + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use `pl.read_json` to dispatch to this method. + + See Also + -------- + polars.io.read_json + + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. 
+ + Use `pl.read_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with `NaN`. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to `True` will raise a `NotImplementedError`. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars DataFrame to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (<DtypeKind.FLOAT: 2>, 64, \'g\', \'=\') + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ...
+ def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the DataFrame as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to `df[0,0]`, with a check that + the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + + ''' + def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are Series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + `structured` is set to `False` and the DataFrame dtypes allow for a + global dtype for all columns. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + function for the conversion to numpy if necessary. + + Notes + ----- + If you\'re attempting to convert String or Decimal to an array, you\'ll need to + install `pyarrow`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. 
, \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + <class \'pandas.core.frame.DataFrame\'> + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 <NA> + 1 2 <NA> b + 2 <NA> 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.String), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.String), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json.
This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path or writeable file-like object to which the data will be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + name + Schema name. Defaults to empty string. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open `xlsxwriter.Workbook` object that has not been closed. + If None, writes to a `dataframe.xlsx` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of `{"key":value,}` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. + column_formats : dict + A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. + dtype_formats : dict + A `{dtype:str,}` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + `column_formats` param). It is also valid to use dtype groups such as + `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid `xlsxwriter` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all `xlsxwriter` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. 
+ header_format : dict + A `{key:value,}` dictionary of `xlsxwriter` format options to apply + to the table header row, such as `{"bold":True, "font_color":"#702963"}`. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a `{colname:funcname,}` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A `{colname:int,}` or `{selector:int,}` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a `{colname:columns,}` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or `{row_index:int,}` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that `row_index` starts at zero and will be + the header row (unless `include_header` is False). + sparklines : dict + A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an `xlsxwriter`-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. 
+ float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + include_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible `xlsxwriter` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic DataFrame: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... 
) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... 
# create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC data will be + written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC record batch data will + be written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. 
+ compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + data_page_size + Size of the data page in bytes. Defaults to 1024^2 bytes. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to `pyarrow.parquet.write_table`. + + If you pass `partition_cols` here, the dataset will be written + using `pyarrow.parquet.write_to_dataset`. + The `partition_cols` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + + ''' + def write_database(self, table_name: str, connection: str) -> int: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Schema-qualified name of the table to create or append to in the target + SQL database. If your table name contains special characters, it should + be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_table_exists : {\'append\', \'replace\', \'fail\'} + The insert mode: + + * \'replace\' will create a new database table, overwriting an existing one. + * \'append\' will append to an existing table. + * \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine to use for writing frame data. + + Returns + ------- + int + The number of rows affected, if the driver provides this information. + Otherwise, returns -1. + + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> deltalake.table.TableMerger | None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\', \'merge\'} + How to handle existing data. + + - If \'error\', throw an error if the table already exists (default). + - If \'append\', will add new data. + - If \'overwrite\', will replace table with new data. + - If \'ignore\', will not write anything if table already exists. + - If \'merge\', return a `TableMerger` object to merge data from the DataFrame + with the existing data. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. 
+ + - See a list of supported storage options for S3 `here `__. + - See a list of supported storage options for GCS `here `__. + - See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + delta_merge_options + Keyword arguments which are required to `MERGE` a Delta lake Table. + See a list of supported merge options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + TableNotFoundError + If the delta table doesn\'t exist and MERGE action is triggered + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. + + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a DataFrame as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + Merge the DataFrame with an existing Delta Lake table. + For all `TableMerger` methods, check the deltalake docs + `here `__. + + Schema evolution is not yet supported in by the `deltalake` package, therefore + `overwrite_schema` will not have any effect on a merge operation. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> ( + ... df.write_delta( + ... "table_path", + ... mode="merge", + ... delta_merge_options={ + ... "predicate": "s.foo = t.foo", + ... "source_alias": "s", + ... "target_alias": "t", + ... }, + ... ) + ... .when_matched_update_all() + ... .when_not_matched_insert_all() + ... .execute() + ... 
) # doctest: +SKIP + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.String)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["a", "b", "c"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["a", "b", "c"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 3 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... 
count += 1 + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 1 ┆ 2 ┆ 3 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["a", "b", "c"], col1=[1, 3, 2], col2=[3, 4, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 3 ┆ 4 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ col1 ┆ 1 ┆ 3 ┆ 2 │ + │ col2 ┆ 3 ┆ 4 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def insert_column(self, index: int, column: Series) -> Self: + ''' + Insert a Series at a certain column index. + + This operation is in place. + + Parameters + ---------- + index + Index at which to insert the new `Series` column. + column + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_column(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_column(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on one or more predicate expressions. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression(s) that evaluates to a boolean Series. 
+ constraints + Column filters; use `name = value` to filter columns by the supplied value. + Each constraint will behave the same as `pl.col(name).eq(value)`, and + will be implicitly joined with the other filter conditions using `&`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... ) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Warnings + -------- + We will never guarantee the output of describe to be stable. + It will show statistics that we deem informative and may + be updated in the future. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... 
"float": [1.0, 2.8, 3.0], + ... "int": [4, 5, None], + ... "bool": [True, False, True], + ... "str": [None, "b", "c"], + ... "str2": ["usd", "eur", None], + ... "date": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬───────┬──────┬──────┬────────────┐ + │ describe ┆ float ┆ int ┆ bool ┆ str ┆ str2 ┆ date │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ str ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪═══════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3 ┆ 2 ┆ 2 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ False ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 2.8 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ True ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴───────┴──────┴──────┴────────────┘ + + ''' + def get_column_index(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.get_column_index("ham") + 2 + + ''' + def replace_column(self, index: int, column: Series) -> Self: + ''' + Replace a column at an index location. + + This operation is in place. + + Parameters + ---------- + index + Column index. + column + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_column(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def equals(self, other: DataFrame) -> bool: + ''' + Check whether the DataFrame is equal to another DataFrame. 
+ + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + + See Also + -------- + assert_frame_equal + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.equals(df1) + True + >>> df1.equals(df2) + False + + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first `abs(n)`. 
+ + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. + + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + head + + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... 
return data.with_columns(pl.col(col_name).cast(pl.Int64)) + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. 
+ + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The `GroupBy` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ``<t_0, t_1, ..., t_n>``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. 
If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... ) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... 
) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + interval will start \'every\' duration + offset + change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... 
"values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\', \'outer_coalesce\'} + Join strategy. + + * *inner* + Returns rows that have matching values in both tables + * *left* + Returns all rows from the left table, and the matched rows from the + right table + * *outer* + Returns all rows when there is a match in either left or right table + * *outer_coalesce* + Same as \'outer\', but coalesces the key columns + * *cross* + Returns the cartisian product of rows from both tables + * *semi* + Filter rows that have a match in the right table. + * *anti* + Filter rows that not have a match in the right table. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + join_nulls + Join on null values. By default null values will never produce matches. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 5) + ┌──────┬──────┬──────┬───────┬───────────┐ + │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞══════╪══════╪══════╪═══════╪═══════════╡ + │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │ + │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │ + │ null ┆ null ┆ null ┆ z ┆ d │ + │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │ + └──────┴──────┴──────┴───────┴───────────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see `pl.StringCache()`. + + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: `udf(row)`. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level `apply` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level `apply` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of + this `DataFrame`, `extend` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer `vstack` over `extend` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single `DataFrame`. In the latter case, finish the sequence of + `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + columns + Names of the columns that should be removed from the dataframe, or + a selector that determines the columns to drop. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.String).to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.String}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill `value`. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or String datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to `None` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. 
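A small, hedged sketch of how `melt` and `pivot` (documented above) relate: unpivoting to long format and pivoting back reconstructs the original wide frame. Column names and the `"first"` aggregation are illustrative choices, not prescribed by the stub.

```python
import polars as pl

df = pl.DataFrame({"a": ["x", "y", "z"], "b": [1, 3, 5], "c": [2, 4, 6]})

# Wide -> long: one row per (id, variable, value) triple.
long = df.melt(id_vars="a", variable_name="col", value_name="val")

# Long -> wide again; "first" suffices because each (a, col) pair is unique.
wide = long.pivot(values="val", index="a", columns="col", aggregate_function="first")
print(wide.sort("a"))  # shape (3, 3) with columns ["a", "b", "c"], as in the original frame
```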
+ + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are the distinct + group values that identify that group. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying `as_dict=True`. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {\'a\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + \'b\': shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + \'c\': shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. 
+ + >>> df.shift(-2, fill_value=100) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + + ''' + def max(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`max_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def max_horizontal(self) -> Series: + ''' + Get the maximum value horizontally across columns. + + Returns + ------- + Series + A Series named `"max"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.max_horizontal() + shape: (3,) + Series: \'max\' [f64] + [ + 4.0 + 5.0 + 6.0 + ] + ''' + def min(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`min_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + ''' + def min_horizontal(self) -> Series: + ''' + Get the minimum value horizontally across columns. + + Returns + ------- + Series + A Series named `"min"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.min_horizontal() + shape: (3,) + Series: \'min\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`sum_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + ''' + def sum_horizontal(self) -> Series: + ''' + Sum all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. 
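A minimal sketch contrasting the vertical reductions and their `*_horizontal` counterparts documented in this block; the data is illustrative.

```python
import polars as pl

df = pl.DataFrame({"x": [1, 2, 3], "y": [4.0, 5.0, 6.0]})

# Vertical: aggregate each column down to a single row.
print(df.sum())  # one row: x = 6, y = 15.0

# Horizontal: aggregate across columns, one value per row.
print(df.sum_horizontal())  # Series "sum": [5.0, 7.0, 9.0]
```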
+ + Returns + ------- + Series + A Series named `"sum"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.sum_horizontal() + shape: (3,) + Series: \'sum\' [f64] + [ + 5.0 + 7.0 + 9.0 + ] + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`mean_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + ''' + def mean_horizontal(self) -> Series: + ''' + Take the mean of all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"mean"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.mean_horizontal() + shape: (3,) + Series: \'mean\' [f64] + [ + 2.5 + 3.5 + 4.5 + ] + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to `None` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
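A short sketch tying `unique` and `n_unique` (both documented above) together; the subset used is illustrative only.

```python
import polars as pl

df = pl.DataFrame({"foo": [1, 2, 3, 1], "bar": ["a", "a", "b", "a"]})

# Distinct rows over a subset of columns ...
deduped = df.unique(subset=["foo", "bar"], maintain_order=True)

# ... and the matching count, without materialising the deduplicated frame.
assert df.n_unique(subset=["foo", "bar"]) == deduped.height  # 3
```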
+ + Notes + ----- + This method operates at the `DataFrame` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. 
+ + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + - Int8 + String = String + - Float32 + Int64 = Float32 + - Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The `index` and `by_predicate` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using `by_predicate` it is an error condition if anything other than + one row is returned; more than one row raises `TooManyRowsReturnedError`, and + zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of `iter_rows()` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify `named=True` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use `by_predicate` to return the row that matches the given predicate. 
+ + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using `iter_rows` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materialising all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialise all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialise all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + + ''' + def iter_columns(self) -> Iterator[Series]: + ''' + Returns an iterator over the DataFrame\'s columns. 
+ + Notes + ----- + Consider whether you can use :func:`all` instead. + If you can, it will be more efficient. + + Returns + ------- + Iterator of Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [s.name for s in df.iter_columns()] + [\'a\', \'b\'] + + If you\'re using this to modify a dataframe\'s columns, e.g. + + >>> # Do NOT do this + >>> pl.DataFrame(column * 2 for column in df.iter_columns()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + + then consider whether you can use :func:`all` instead: + + >>> df.select(pl.all() * 2) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + + """ + def gather_every(self, n: int, offset: int = ...) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.gather_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + >>> s.gather_every(2, offset=1) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash_rows` does not guarantee stable results + across different Polars versions. 
Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + + ''' + def to_struct(self, name: str = ...) -> Series: + ''' + Convert a `DataFrame` to a `Series` of type `Struct`. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy `corrcoef` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. 
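An illustrative sketch of the struct round trip suggested by `to_struct` and `unnest` above; `pl.struct(pl.all())` and the column name `"fields"` are assumptions made for the example, not taken from the stub.

```python
import polars as pl

df = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})

# Pack every column into a single struct column ...
packed = df.select(pl.struct(pl.all()).alias("fields"))

# ... then unpack it again with `unnest`, recovering the original columns.
print(packed.unnest("fields"))  # columns "a" and "b" as before
```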
+ + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy `corrcoef`. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the values in `other`. + + .. warning:: + This functionality is experimental and may change without it being + considered a breaking change. + + By default, null values in the right frame are ignored. Use + `include_nulls=False` to overwrite values in this frame with + null values in the other frame. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. + If none given the row count is used. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. 
+ + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce + when `include_nulls = False` + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def count(self) -> DataFrame: + ''' + Return the number of non-null elements for each column. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"a": [1, 2, 3, 4], "b": [1, 2, 1, None], "c": [None, None, None, None]} + ... ) + >>> df.count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 3 ┆ 0 │ + └─────┴─────┴─────┘ + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. 
+ This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int, offset: int = ...) -> DataFrame: + """ + Take every nth row in the DataFrame and return as a new DataFrame. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + def find_idx_by_name(self, name: str) -> int: + """ + Find the index of a column by name. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`get_column_index`. + + Parameters + ---------- + name + Name of the column to find. + """ + def insert_at_idx(self, index: int, column: Series) -> Self: + """ + Insert a Series at a certain column index. This operation is in place. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`insert_column`. + + Parameters + ---------- + index + Column to insert the new `Series` column. + column + `Series` to insert. + """ + def replace_at_idx(self, index: int, new_column: Series) -> Self: + """ + Replace a column at an index location. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`replace_column`. + + Parameters + ---------- + index + Column index. + new_column + Series that will replace the column. 
+ """ + def frame_equal(self, other: DataFrame) -> bool: + """ + Check whether the DataFrame is equal to another DataFrame. + + .. deprecated:: 0.19.16 + This method has been renamed to :func:`equals`. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + """ + @property + def plot(self): ... + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/expr/expr b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/expr/expr deleted file mode 100644 index 5131d44..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/expr/expr +++ /dev/null @@ -1,8289 +0,0 @@ -import P -import np as np -import pl -from builtins import PyExpr -from datetime import timedelta -from polars.datatypes.classes import Categorical as Categorical, Null as Null, Struct as Struct, UInt32 as UInt32, Utf8 as Utf8 -from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype -from polars.dependencies import _check_for_numpy as _check_for_numpy -from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning -from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace -from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace -from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace -from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace -from polars.expr.list import ExprListNameSpace as ExprListNameSpace -from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace -from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace -from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace -from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, warn_closed_future_change as warn_closed_future_change -from polars.utils.meta import threadpool_size as threadpool_size -from polars.utils.various import no_default as no_default, sphinx_accessor as sphinx_accessor -from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, NoReturn, Sequence - -TYPE_CHECKING: bool -py_arg_where: builtin_function_or_method -pyreduce: builtin_function_or_method - -class Expr: - _pyexpr: _ClassVar[None] = ... - _accessors: _ClassVar[set] = ... - @classmethod - def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... - def _to_pyexpr(self, other: Any) -> PyExpr: ... - def _repr_html_(self) -> str: ... - def __bool__(self) -> NoReturn: ... - def __abs__(self) -> Self: ... - def __add__(self, other: Any) -> Self: ... - def __radd__(self, other: Any) -> Self: ... 
- def __and__(self, other: Expr | int | bool) -> Self: ... - def __rand__(self, other: Any) -> Self: ... - def __eq__(self, other: Any) -> Self: ... - def __floordiv__(self, other: Any) -> Self: ... - def __rfloordiv__(self, other: Any) -> Self: ... - def __ge__(self, other: Any) -> Self: ... - def __gt__(self, other: Any) -> Self: ... - def __invert__(self) -> Self: ... - def __le__(self, other: Any) -> Self: ... - def __lt__(self, other: Any) -> Self: ... - def __mod__(self, other: Any) -> Self: ... - def __rmod__(self, other: Any) -> Self: ... - def __mul__(self, other: Any) -> Self: ... - def __rmul__(self, other: Any) -> Self: ... - def __ne__(self, other: Any) -> Self: ... - def __neg__(self) -> Expr: ... - def __or__(self, other: Expr | int | bool) -> Self: ... - def __ror__(self, other: Any) -> Self: ... - def __pos__(self) -> Expr: ... - def __pow__(self, power: int | float | Series | Expr) -> Self: ... - def __rpow__(self, base: int | float | Expr) -> Expr: ... - def __sub__(self, other: Any) -> Self: ... - def __rsub__(self, other: Any) -> Self: ... - def __truediv__(self, other: Any) -> Self: ... - def __rtruediv__(self, other: Any) -> Self: ... - def __xor__(self, other: Expr | int | bool) -> Self: ... - def __rxor__(self, other: Any) -> Self: ... - def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: - """Numpy universal functions.""" - @classmethod - def from_json(cls, value: str) -> Self: - """ - Read an expression from a JSON encoded string to construct an Expression. - - Parameters - ---------- - value - JSON encoded string value - - """ - def to_physical(self) -> Self: - ''' - Cast to physical representation of the logical dtype. - - - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` - - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` - - `List(inner)` -> `List(physical of inner)` - - Other data types will be left unchanged. - - Examples - -------- - Replicating the pandas - `pd.factorize - `_ - function. - - >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( - ... [ - ... pl.col("vals").cast(pl.Categorical), - ... pl.col("vals") - ... .cast(pl.Categorical) - ... .to_physical() - ... .alias("vals_physical"), - ... ] - ... ) - shape: (4, 2) - ┌──────┬───────────────┐ - │ vals ┆ vals_physical │ - │ --- ┆ --- │ - │ cat ┆ u32 │ - ╞══════╪═══════════════╡ - │ a ┆ 0 │ - │ x ┆ 1 │ - │ null ┆ null │ - │ a ┆ 0 │ - └──────┴───────────────┘ - - ''' - def any(self) -> Self: - ''' - Return whether any of the values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is null. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, False], - ... "b": [False, False], - ... "c": [None, False], - ... } - ... 
) - >>> df.select(pl.col("*").any()) - shape: (1, 3) - ┌──────┬───────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪═══════╡ - │ true ┆ false ┆ false │ - └──────┴───────┴───────┘ - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> df.select(pl.col("*").any(ignore_nulls=False)) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ null │ - └──────┴───────┴──────┘ - - ''' - def all(self) -> Self: - ''' - Return whether all values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - .. note:: - This method is not to be confused with the function :func:`polars.all`, - which can be used to select all columns. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is null. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, True], - ... "b": [False, True], - ... "c": [None, True], - ... } - ... ) - >>> df.select(pl.col("*").all()) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ true │ - └──────┴───────┴──────┘ - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> df.select(pl.col("*").all(ignore_nulls=False)) - shape: (1, 3) - ┌──────┬───────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞══════╪═══════╪══════╡ - │ true ┆ false ┆ null │ - └──────┴───────┴──────┘ - - ''' - def arg_true(self) -> Self: - ''' - Return indices where expression evaluates `True`. - - .. warning:: - Modifies number of rows returned, so will fail in combination with other - expressions. Use as only expression in `select` / `with_columns`. - - See Also - -------- - Series.arg_true : Return indices where Series is True - polars.arg_where - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) - >>> df.select((pl.col("a") == 1).arg_true()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - │ 3 │ - └─────┘ - - ''' - def sqrt(self) -> Self: - ''' - Compute the square root of the elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").sqrt()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.414214 │ - │ 2.0 │ - └──────────┘ - - ''' - def cbrt(self) -> Self: - ''' - Compute the cube root of the elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").cbrt()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.259921 │ - │ 1.587401 │ - └──────────┘ - - ''' - def log10(self) -> Self: - ''' - Compute the base 10 logarithm of the input array, element-wise. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").log10()) - shape: (3, 1) - ┌─────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞═════════╡ - │ 0.0 │ - │ 0.30103 │ - │ 0.60206 │ - └─────────┘ - - ''' - def exp(self) -> Self: - ''' - Compute the exponential, element-wise. 
- - Examples - -------- - >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) - >>> df.select(pl.col("values").exp()) - shape: (3, 1) - ┌──────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 2.718282 │ - │ 7.389056 │ - │ 54.59815 │ - └──────────┘ - - ''' - def alias(self, name: str) -> Self: - ''' - Rename the expression. - - Parameters - ---------- - name - The new name. - - See Also - -------- - map - prefix - suffix - - Examples - -------- - Rename an expression to avoid overwriting an existing column. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... ) - >>> df.with_columns( - ... pl.col("a") + 10, - ... pl.col("b").str.to_uppercase().alias("c"), - ... ) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 11 ┆ x ┆ X │ - │ 12 ┆ y ┆ Y │ - │ 13 ┆ z ┆ Z │ - └─────┴─────┴─────┘ - - Overwrite the default name of literal columns to prevent errors due to duplicate - column names. - - >>> df.with_columns( - ... pl.lit(True).alias("c"), - ... pl.lit(4.0).alias("d"), - ... ) - shape: (3, 4) - ┌─────┬─────┬──────┬─────┐ - │ a ┆ b ┆ c ┆ d │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ bool ┆ f64 │ - ╞═════╪═════╪══════╪═════╡ - │ 1 ┆ x ┆ true ┆ 4.0 │ - │ 2 ┆ y ┆ true ┆ 4.0 │ - │ 3 ┆ z ┆ true ┆ 4.0 │ - └─────┴─────┴──────┴─────┘ - - ''' - def map_alias(self, function: Callable[[str], str]) -> Self: - ''' - Rename the output of an expression by mapping a function over the root name. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.map`. - - Parameters - ---------- - function - Function that maps a root name to a new name. - - See Also - -------- - keep_name - prefix - suffix - - Examples - -------- - Remove a common suffix and convert to lower case. - - >>> df = pl.DataFrame( - ... { - ... "A_reverse": [3, 2, 1], - ... "B_reverse": ["z", "y", "x"], - ... } - ... ) - >>> df.with_columns( - ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) - ... ) - shape: (3, 4) - ┌───────────┬───────────┬─────┬─────┐ - │ A_reverse ┆ B_reverse ┆ a ┆ b │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═══════════╪═══════════╪═════╪═════╡ - │ 3 ┆ z ┆ 1 ┆ x │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 1 ┆ x ┆ 3 ┆ z │ - └───────────┴───────────┴─────┴─────┘ - - ''' - def prefix(self, prefix: str) -> Self: - ''' - Add a prefix to the root column name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.prefix`. - - Parameters - ---------- - prefix - Prefix to add to the root column name. - - Notes - ----- - This will undo any previous renaming operations on the expression. - - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - suffix - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... ) - >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) - shape: (3, 4) - ┌─────┬─────┬───────────┬───────────┐ - │ a ┆ b ┆ reverse_a ┆ reverse_b │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪═════╪═══════════╪═══════════╡ - │ 1 ┆ x ┆ 3 ┆ z │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 3 ┆ z ┆ 1 ┆ x │ - └─────┴─────┴───────────┴───────────┘ - - ''' - def suffix(self, suffix: str) -> Self: - ''' - Add a suffix to the root column name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.suffix`. 
- - Parameters - ---------- - suffix - Suffix to add to the root column name. - - Notes - ----- - This will undo any previous renaming operations on the expression. - - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - prefix - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["x", "y", "z"], - ... } - ... ) - >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) - shape: (3, 4) - ┌─────┬─────┬───────────┬───────────┐ - │ a ┆ b ┆ a_reverse ┆ b_reverse │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪═════╪═══════════╪═══════════╡ - │ 1 ┆ x ┆ 3 ┆ z │ - │ 2 ┆ y ┆ 2 ┆ y │ - │ 3 ┆ z ┆ 1 ┆ x │ - └─────┴─────┴───────────┴───────────┘ - - ''' - def keep_name(self) -> Self: - ''' - Keep the original root name of the expression. - - .. deprecated:: 0.19.12 - This method has been renamed to :func:`name.keep`. - - Notes - ----- - Due to implementation constraints, this method can only be called as the last - expression in a chain. - - See Also - -------- - alias - - Examples - -------- - Undo an alias operation. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2], - ... "b": [3, 4], - ... } - ... ) - >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 9 ┆ 3 │ - │ 18 ┆ 4 │ - └─────┴─────┘ - - Prevent errors due to duplicate column names. - - >>> df.select((pl.lit(10) / pl.all()).name.keep()) - shape: (2, 2) - ┌──────┬──────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════════╡ - │ 10.0 ┆ 3.333333 │ - │ 5.0 ┆ 2.5 │ - └──────┴──────────┘ - - ''' - def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: - ''' - Exclude columns from a multi-column expression. - - Only works after a wildcard or regex column selection, and you cannot provide - both string column names *and* dtypes (you may prefer to use selectors instead). - - Parameters - ---------- - columns - The name or datatype of the column(s) to exclude. Accepts regular expression - input. Regular expressions should start with `^` and end with `$`. - *more_columns - Additional names or datatypes of columns to exclude, specified as positional - arguments. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "aa": [1, 2, 3], - ... "ba": ["a", "b", None], - ... "cc": [None, 2.5, 1.5], - ... } - ... ) - >>> df - shape: (3, 3) - ┌─────┬──────┬──────┐ - │ aa ┆ ba ┆ cc │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ f64 │ - ╞═════╪══════╪══════╡ - │ 1 ┆ a ┆ null │ - │ 2 ┆ b ┆ 2.5 │ - │ 3 ┆ null ┆ 1.5 │ - └─────┴──────┴──────┘ - - Exclude by column name(s): - - >>> df.select(pl.all().exclude("ba")) - shape: (3, 2) - ┌─────┬──────┐ - │ aa ┆ cc │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ null │ - │ 2 ┆ 2.5 │ - │ 3 ┆ 1.5 │ - └─────┴──────┘ - - Exclude by regex, e.g. removing all columns whose names end with the letter "a": - - >>> df.select(pl.all().exclude("^.*a$")) - shape: (3, 1) - ┌──────┐ - │ cc │ - │ --- │ - │ f64 │ - ╞══════╡ - │ null │ - │ 2.5 │ - │ 1.5 │ - └──────┘ - - Exclude by dtype(s), e.g. 
removing all columns of type Int64 or Float64: - - >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) - shape: (3, 1) - ┌──────┐ - │ ba │ - │ --- │ - │ str │ - ╞══════╡ - │ a │ - │ b │ - │ null │ - └──────┘ - - ''' - def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the expression as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Examples - -------- - >>> def extract_number(expr: pl.Expr) -> pl.Expr: - ... """Extract the digits from a string.""" - ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) - >>> - >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: - ... """Set even numbers negative, and scale by a user-supplied value.""" - ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) - ... return expr * n - >>> - >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) - >>> df.with_columns( - ... udfs=( - ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) - ... ), - ... ) - shape: (4, 2) - ┌──────┬──────┐ - │ val ┆ udfs │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞══════╪══════╡ - │ a: 1 ┆ 5 │ - │ b: 2 ┆ -10 │ - │ c: 3 ┆ 15 │ - │ d: 4 ┆ -20 │ - └──────┴──────┘ - - ''' - def is_not(self) -> Self: - """ - Negate a boolean expression. - - .. deprecated:: 0.19.2 - This method has been renamed to :func:`Expr.not_`. - - """ - def not_(self) -> Self: - ''' - Negate a boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [True, False, False], - ... "b": ["a", "b", None], - ... } - ... ) - >>> df - shape: (3, 2) - ┌───────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ bool ┆ str │ - ╞═══════╪══════╡ - │ true ┆ a │ - │ false ┆ b │ - │ false ┆ null │ - └───────┴──────┘ - >>> df.select(pl.col("a").not_()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ true │ - └───────┘ - - ''' - def is_null(self) -> Self: - ''' - Returns a boolean Series indicating which values are null. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null - shape: (5, 4) - ┌──────┬─────┬──────────┬──────────┐ - │ a ┆ b ┆ a_isnull ┆ b_isnull │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪═════╪══════════╪══════════╡ - │ 1 ┆ 1.0 ┆ false ┆ false │ - │ 2 ┆ 2.0 ┆ false ┆ false │ - │ null ┆ NaN ┆ true ┆ false │ - │ 1 ┆ 1.0 ┆ false ┆ false │ - │ 5 ┆ 5.0 ┆ false ┆ false │ - └──────┴─────┴──────────┴──────────┘ - - ''' - def is_not_null(self) -> Self: - ''' - Returns a boolean Series indicating which values are not null. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns( - ... pl.all().is_not_null().name.suffix("_not_null") # nan != null - ... 
) - shape: (5, 4) - ┌──────┬─────┬────────────┬────────────┐ - │ a ┆ b ┆ a_not_null ┆ b_not_null │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪═════╪════════════╪════════════╡ - │ 1 ┆ 1.0 ┆ true ┆ true │ - │ 2 ┆ 2.0 ┆ true ┆ true │ - │ null ┆ NaN ┆ false ┆ true │ - │ 1 ┆ 1.0 ┆ true ┆ true │ - │ 5 ┆ 5.0 ┆ true ┆ true │ - └──────┴─────┴────────────┴────────────┘ - - ''' - def is_finite(self) -> Self: - ''' - Returns a boolean Series indicating which values are finite. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1.0, 2], - ... "B": [3.0, float("inf")], - ... } - ... ) - >>> df.select(pl.all().is_finite()) - shape: (2, 2) - ┌──────┬───────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞══════╪═══════╡ - │ true ┆ true │ - │ true ┆ false │ - └──────┴───────┘ - - ''' - def is_infinite(self) -> Self: - ''' - Returns a boolean Series indicating which values are infinite. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1.0, 2], - ... "B": [3.0, float("inf")], - ... } - ... ) - >>> df.select(pl.all().is_infinite()) - shape: (2, 2) - ┌───────┬───────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ bool ┆ bool │ - ╞═══════╪═══════╡ - │ false ┆ false │ - │ false ┆ true │ - └───────┴───────┘ - - ''' - def is_nan(self) -> Self: - ''' - Returns a boolean Series indicating which values are NaN. - - Notes - ----- - Floating point `NaN` (Not A Number) should not be confused - with missing data represented as `Null/None`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) - shape: (5, 3) - ┌──────┬─────┬─────────┐ - │ a ┆ b ┆ b_isnan │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪═════╪═════════╡ - │ 1 ┆ 1.0 ┆ false │ - │ 2 ┆ 2.0 ┆ false │ - │ null ┆ NaN ┆ true │ - │ 1 ┆ 1.0 ┆ false │ - │ 5 ┆ 5.0 ┆ false │ - └──────┴─────┴─────────┘ - - ''' - def is_not_nan(self) -> Self: - ''' - Returns a boolean Series indicating which values are not NaN. - - Notes - ----- - Floating point `NaN` (Not A Number) should not be confused - with missing data represented as `Null/None`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None, 1, 5], - ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], - ... } - ... ) - >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) - shape: (5, 3) - ┌──────┬─────┬──────────────┐ - │ a ┆ b ┆ b_is_not_nan │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪═════╪══════════════╡ - │ 1 ┆ 1.0 ┆ true │ - │ 2 ┆ 2.0 ┆ true │ - │ null ┆ NaN ┆ false │ - │ 1 ┆ 1.0 ┆ true │ - │ 5 ┆ 5.0 ┆ true │ - └──────┴─────┴──────────────┘ - - ''' - def agg_groups(self) -> Self: - ''' - Get the group indexes of the group by operation. - - Should be used in aggregation context only. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... "one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [94, 95, 96, 97, 97, 99], - ... } - ... 
) - >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[u32] │ - ╞═══════╪═══════════╡ - │ one ┆ [0, 1, 2] │ - │ two ┆ [3, 4, 5] │ - └───────┴───────────┘ - - ''' - def count(self) -> Self: - ''' - Return the number of elements in the column. - - .. warning:: - Null values are treated like regular elements in this context. - - Examples - -------- - >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) - >>> df.select(pl.all().count()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 3 ┆ 3 │ - └─────┴─────┘ - - ''' - def len(self) -> Self: - ''' - Return the number of elements in the column. - - Null values are treated like regular elements in this context. - - Alias for :func:`count`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) - >>> df.select(pl.all().len()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 3 ┆ 3 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: - ''' - Get a slice of this expression. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10, 11], - ... "b": [None, 4, 4, 4], - ... } - ... ) - >>> df.select(pl.all().slice(1, 2)) - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 9 ┆ 4 │ - │ 10 ┆ 4 │ - └─────┴─────┘ - - ''' - def append(self, other: IntoExpr) -> Self: - ''' - Append expressions. - - This is done by adding the chunks of `other` to this `Series`. - - Parameters - ---------- - other - Expression to append. - upcast - Cast both `Series` to the same supertype. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10], - ... "b": [None, 4, 4], - ... } - ... ) - >>> df.select(pl.all().head(1).append(pl.all().tail(1))) - shape: (2, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 8 ┆ null │ - │ 10 ┆ 4 │ - └─────┴──────┘ - - ''' - def rechunk(self) -> Self: - ''' - Create a single chunk of memory for this Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - - Create a Series with 3 nulls, append column a then rechunk - - >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) - shape: (6, 1) - ┌────────┐ - │ repeat │ - │ --- │ - │ i64 │ - ╞════════╡ - │ null │ - │ null │ - │ null │ - │ 1 │ - │ 1 │ - │ 2 │ - └────────┘ - - ''' - def drop_nulls(self) -> Self: - ''' - Drop all null values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nans - - Notes - ----- - A null value is not the same as a NaN value. - To drop NaN values, use :func:`drop_nans`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) - >>> df.select(pl.col("a").drop_nulls()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 3.0 │ - │ NaN │ - └─────┘ - - ''' - def drop_nans(self) -> Self: - ''' - Drop all floating point NaN values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nulls - - Notes - ----- - A NaN value is not the same as a null value. - To drop null values, use :func:`drop_nulls`. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) - >>> df.select(pl.col("a").drop_nans()) - shape: (3, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 1.0 │ - │ null │ - │ 3.0 │ - └──────┘ - - ''' - def cum_sum(self) -> Self: - ''' - Get an array with the cumulative sum computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_sum().alias("cum_sum"), - ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_sum ┆ cum_sum_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 10 │ - │ 2 ┆ 3 ┆ 9 │ - │ 3 ┆ 6 ┆ 7 │ - │ 4 ┆ 10 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - Null values are excluded, but can also be filled by calling `forward_fill`. - - >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) - >>> df.with_columns( - ... pl.col("values").cum_sum().alias("value_cum_sum"), - ... pl.col("values") - ... .cum_sum() - ... .forward_fill() - ... .alias("value_cum_sum_all_filled"), - ... ) - shape: (8, 3) - ┌────────┬───────────────┬──────────────────────────┐ - │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞════════╪═══════════════╪══════════════════════════╡ - │ null ┆ null ┆ null │ - │ 10 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 8 ┆ 18 ┆ 18 │ - │ 9 ┆ 27 ┆ 27 │ - │ null ┆ null ┆ 27 │ - │ 16 ┆ 43 ┆ 43 │ - │ null ┆ null ┆ 43 │ - └────────┴───────────────┴──────────────────────────┘ - - ''' - def cum_prod(self) -> Self: - ''' - Get an array with the cumulative product computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_prod().alias("cum_prod"), - ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), - ... ) - shape: (4, 3) - ┌─────┬──────────┬──────────────────┐ - │ a ┆ cum_prod ┆ cum_prod_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪══════════╪══════════════════╡ - │ 1 ┆ 1 ┆ 24 │ - │ 2 ┆ 2 ┆ 24 │ - │ 3 ┆ 6 ┆ 12 │ - │ 4 ┆ 24 ┆ 4 │ - └─────┴──────────┴──────────────────┘ - - ''' - def cum_min(self) -> Self: - ''' - Get an array with the cumulative min computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_min().alias("cum_min"), - ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_min ┆ cum_min_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 1 ┆ 2 │ - │ 3 ┆ 1 ┆ 3 │ - │ 4 ┆ 1 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - ''' - def cum_max(self) -> Self: - ''' - Get an array with the cumulative max computed at every element. - - Parameters - ---------- - reverse - Reverse the operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... 
pl.col("a").cum_max().alias("cum_max"), - ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), - ... ) - shape: (4, 3) - ┌─────┬─────────┬─────────────────┐ - │ a ┆ cum_max ┆ cum_max_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════════╪═════════════════╡ - │ 1 ┆ 1 ┆ 4 │ - │ 2 ┆ 2 ┆ 4 │ - │ 3 ┆ 3 ┆ 4 │ - │ 4 ┆ 4 ┆ 4 │ - └─────┴─────────┴─────────────────┘ - - Null values are excluded, but can also be filled by calling `forward_fill`. - - >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) - >>> df.with_columns( - ... pl.col("values").cum_max().alias("cum_max"), - ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), - ... ) - shape: (8, 3) - ┌────────┬─────────┬────────────────────┐ - │ values ┆ cum_max ┆ cum_max_all_filled │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞════════╪═════════╪════════════════════╡ - │ null ┆ null ┆ null │ - │ 10 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 8 ┆ 10 ┆ 10 │ - │ 9 ┆ 10 ┆ 10 │ - │ null ┆ null ┆ 10 │ - │ 16 ┆ 16 ┆ 16 │ - │ null ┆ null ┆ 16 │ - └────────┴─────────┴────────────────────┘ - - ''' - def cum_count(self) -> Self: - ''' - Get an array with the cumulative count computed at every element. - - Counting from 0 to len - - Parameters - ---------- - reverse - Reverse the operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("a").cum_count().alias("cum_count"), - ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), - ... ) - shape: (4, 3) - ┌─────┬───────────┬───────────────────┐ - │ a ┆ cum_count ┆ cum_count_reverse │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ u32 ┆ u32 │ - ╞═════╪═══════════╪═══════════════════╡ - │ 1 ┆ 0 ┆ 3 │ - │ 2 ┆ 1 ┆ 2 │ - │ 3 ┆ 2 ┆ 1 │ - │ 4 ┆ 3 ┆ 0 │ - └─────┴───────────┴───────────────────┘ - - ''' - def floor(self) -> Self: - ''' - Rounds down to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) - >>> df.select(pl.col("a").floor()) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - │ 0.0 │ - │ 1.0 │ - │ 1.0 │ - └─────┘ - - ''' - def ceil(self) -> Self: - ''' - Rounds up to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) - >>> df.select(pl.col("a").ceil()) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 1.0 │ - │ 1.0 │ - │ 2.0 │ - └─────┘ - - ''' - def round(self, decimals: int = ...) -> Self: - ''' - Round underlying floating point data by `decimals` digits. - - Parameters - ---------- - decimals - Number of decimals to round by. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) - >>> df.select(pl.col("a").round(1)) - shape: (4, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.3 │ - │ 0.5 │ - │ 1.0 │ - │ 1.2 │ - └─────┘ - - ''' - def round_sig_figs(self, digits: int) -> Self: - ''' - Round to a number of significant figures. - - Parameters - ---------- - digits - Number of significant figures to round to. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) - >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) - shape: (3, 2) - ┌─────────┬────────────────┐ - │ a ┆ round_sig_figs │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════════╪════════════════╡ - │ 0.01234 ┆ 0.012 │ - │ 3.333 ┆ 3.3 │ - │ 1234.0 ┆ 1200.0 │ - └─────────┴────────────────┘ - - ''' - def dot(self, other: Expr | str) -> Self: - ''' - Compute the dot/inner product between two Expressions. - - Parameters - ---------- - other - Expression to compute dot product with. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> df.select(pl.col("a").dot(pl.col("b"))) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 44 │ - └─────┘ - - ''' - def mode(self) -> Self: - ''' - Compute the most occurring value(s). - - Can return multiple Values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 1, 2, 3], - ... "b": [1, 1, 2, 2], - ... } - ... ) - >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 1 │ - │ 1 ┆ 2 │ - └─────┴─────┘ - - ''' - def cast(self, dtype: PolarsDataType | type[Any]) -> Self: - ''' - Cast between data types. - - Parameters - ---------- - dtype - DataType to cast to. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": ["4", "5", "6"], - ... } - ... ) - >>> df.with_columns( - ... [ - ... pl.col("a").cast(pl.Float64), - ... pl.col("b").cast(pl.Int32), - ... ] - ... ) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ i32 │ - ╞═════╪═════╡ - │ 1.0 ┆ 4 │ - │ 2.0 ┆ 5 │ - │ 3.0 ┆ 6 │ - └─────┴─────┘ - - ''' - def sort(self) -> Self: - ''' - Sort this column. - - When used in a projection/selection context, the whole column is sorted. - When used in a group by context, the groups are sorted. - - Parameters - ---------- - descending - Sort in descending order. - nulls_last - Place null values last. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, None, 3, 2], - ... } - ... ) - >>> df.select(pl.col("a").sort()) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ null │ - │ 1 │ - │ 2 │ - │ 3 │ - └──────┘ - >>> df.select(pl.col("a").sort(descending=True)) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ null │ - │ 3 │ - │ 2 │ - │ 1 │ - └──────┘ - >>> df.select(pl.col("a").sort(nulls_last=True)) - shape: (4, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ 1 │ - │ 2 │ - │ 3 │ - │ null │ - └──────┘ - - When sorting in a group by context, the groups are sorted. - - >>> df = pl.DataFrame( - ... { - ... "group": ["one", "one", "one", "two", "two", "two"], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌───────┬────────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪════════════╡ - │ two ┆ [3, 4, 99] │ - │ one ┆ [1, 2, 98] │ - └───────┴────────────┘ - - ''' - def top_k(self, k: int | IntoExprColumn = ...) -> Self: - ''' - Return the `k` largest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. 
- - See Also - -------- - bottom_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("value").top_k().alias("top_k"), - ... pl.col("value").bottom_k().alias("bottom_k"), - ... ] - ... ) - shape: (5, 2) - ┌───────┬──────────┐ - │ top_k ┆ bottom_k │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═══════╪══════════╡ - │ 99 ┆ 1 │ - │ 98 ┆ 2 │ - │ 4 ┆ 3 │ - │ 3 ┆ 4 │ - │ 2 ┆ 98 │ - └───────┴──────────┘ - - ''' - def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: - ''' - Return the `k` smallest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - top_k - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("value").top_k().alias("top_k"), - ... pl.col("value").bottom_k().alias("bottom_k"), - ... ] - ... ) - shape: (5, 2) - ┌───────┬──────────┐ - │ top_k ┆ bottom_k │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═══════╪══════════╡ - │ 99 ┆ 1 │ - │ 98 ┆ 2 │ - │ 4 ┆ 3 │ - │ 3 ┆ 4 │ - │ 2 ┆ 98 │ - └───────┴──────────┘ - - ''' - def arg_sort(self) -> Self: - ''' - Get the index values that would sort this column. - - Parameters - ---------- - descending - Sort in descending (descending) order. - nulls_last - Place null values last instead of first. - - Returns - ------- - Expr - Expression of data type :class:`UInt32`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... ) - >>> df.select(pl.col("a").arg_sort()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - │ 0 │ - │ 2 │ - └─────┘ - - ''' - def arg_max(self) -> Self: - ''' - Get the index of the maximal value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... ) - >>> df.select(pl.col("a").arg_max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def arg_min(self) -> Self: - ''' - Get the index of the minimal value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [20, 10, 30], - ... } - ... ) - >>> df.select(pl.col("a").arg_min()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - └─────┘ - - ''' - def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: - ''' - Find indices where elements should be inserted to maintain order. - - .. math:: a[i-1] < v <= a[i] - - Parameters - ---------- - element - Expression or scalar value. - side : {\'any\', \'left\', \'right\'} - If \'any\', the index of the first suitable location found is given. - If \'left\', the index of the leftmost suitable location found is given. - If \'right\', return the rightmost suitable location found is given. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "values": [1, 2, 3, 5], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("values").search_sorted(0).alias("zero"), - ... pl.col("values").search_sorted(3).alias("three"), - ... pl.col("values").search_sorted(6).alias("six"), - ... ] - ... ) - shape: (1, 3) - ┌──────┬───────┬─────┐ - │ zero ┆ three ┆ six │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞══════╪═══════╪═════╡ - │ 0 ┆ 2 ┆ 4 │ - └──────┴───────┴─────┘ - - ''' - def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: - ''' - Sort this column by the ordering of other columns. 
- - When used in a projection/selection context, the whole column is sorted. - When used in a group by context, the groups are sorted. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. - *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "a", "b", "b"], - ... "value1": [1, 3, 4, 2], - ... "value2": [8, 7, 6, 5], - ... } - ... ) - >>> df.select(pl.col("group").sort_by("value1")) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ a │ - │ b │ - │ a │ - │ b │ - └───────┘ - - Sorting by expressions is also supported. - - >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ b │ - │ a │ - │ a │ - │ b │ - └───────┘ - - Sort by multiple columns by passing a list of columns. - - >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ b │ - │ a │ - │ b │ - │ a │ - └───────┘ - - Or use positional arguments to sort by multiple columns in the same way. - - >>> df.select(pl.col("group").sort_by("value1", "value2")) - shape: (4, 1) - ┌───────┐ - │ group │ - │ --- │ - │ str │ - ╞═══════╡ - │ a │ - │ b │ - │ a │ - │ b │ - └───────┘ - - When sorting in a group by context, the groups are sorted. - - >>> df.group_by("group").agg( - ... pl.col("value1").sort_by("value2") - ... ) # doctest: +IGNORE_RESULT - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value1 │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ a ┆ [3, 1] │ - │ b ┆ [2, 4] │ - └───────┴───────────┘ - - Take a single row from each group where a column attains its minimal value - within that group. - - >>> df.group_by("group").agg( - ... pl.all().sort_by("value2").first() - ... ) # doctest: +IGNORE_RESULT - shape: (2, 3) - ┌───────┬────────┬────────┐ - │ group ┆ value1 ┆ value2 | - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 | - ╞═══════╪════════╪════════╡ - │ a ┆ 3 ┆ 7 | - │ b ┆ 2 ┆ 5 | - └───────┴────────┴────────┘ - - ''' - def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: - ''' - Take values by index. - - Parameters - ---------- - indices - An expression that leads to a UInt32 dtyped Series. - - Returns - ------- - Expr - Expression of the same data type. - - See Also - -------- - Expr.get : Take a single value - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... "one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group", maintain_order=True).agg( - ... pl.col("value").gather([2, 1]) - ... ) - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ one ┆ [2, 98] │ - │ two ┆ [4, 99] │ - └───────┴───────────┘ - ''' - def get(self, index: int | Expr) -> Self: - ''' - Return a single value by index. - - Parameters - ---------- - index - An expression that leads to a UInt32 index. - - Returns - ------- - Expr - Expression of the same data type. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": [ - ... 
"one", - ... "one", - ... "one", - ... "two", - ... "two", - ... "two", - ... ], - ... "value": [1, 98, 2, 3, 99, 4], - ... } - ... ) - >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) - shape: (2, 2) - ┌───────┬───────┐ - │ group ┆ value │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═══════╡ - │ one ┆ 98 │ - │ two ┆ 99 │ - └───────┴───────┘ - - ''' - def shift(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) - >>> df.with_columns(shift=pl.col("a").shift()) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ null │ - │ 2 ┆ 1 │ - │ 3 ┆ 2 │ - │ 4 ┆ 3 │ - └─────┴───────┘ - - Pass a negative value to shift in the opposite direction instead. - - >>> df.with_columns(shift=pl.col("a").shift(-2)) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - │ 3 ┆ null │ - │ 4 ┆ null │ - └─────┴───────┘ - - Specify `fill_value` to fill the resulting null values. - - >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) - shape: (4, 2) - ┌─────┬───────┐ - │ a ┆ shift │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═══════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - │ 3 ┆ 100 │ - │ 4 ┆ 100 │ - └─────┴───────┘ - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: - ''' - Fill null values using the specified value or strategy. - - To interpolate over null values see interpolate. - See the examples below to fill nulls with an expression. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... } - ... 
) - >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 0 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(99)) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 99 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 4 │ - │ null ┆ 6 │ - └──────┴─────┘ - >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞══════╪═════╡ - │ 1 ┆ 4.0 │ - │ 2 ┆ 5.0 │ - │ null ┆ 6.0 │ - └──────┴─────┘ - >>> df.with_columns(pl.all().fill_null(pl.all().median())) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 1.0 ┆ 4.0 │ - │ 2.0 ┆ 5.0 │ - │ 1.5 ┆ 6.0 │ - └─────┴─────┘ - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Self: - ''' - Fill floating point NaN value with a fill value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1.0, None, float("nan")], - ... "b": [4.0, float("nan"), 6], - ... } - ... ) - >>> df.with_columns(pl.col("b").fill_nan(0)) - shape: (3, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪═════╡ - │ 1.0 ┆ 4.0 │ - │ null ┆ 0.0 │ - │ NaN ┆ 6.0 │ - └──────┴─────┘ - - ''' - def forward_fill(self, limit: int | None = ...) -> Self: - ''' - Fill missing values with the latest seen values. - - Parameters - ---------- - limit - The number of consecutive null values to forward fill. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... } - ... ) - >>> df.select(pl.all().forward_fill()) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 4 │ - │ 2 ┆ 6 │ - └─────┴─────┘ - - ''' - def backward_fill(self, limit: int | None = ...) -> Self: - ''' - Fill missing values with the next to be seen values. - - Parameters - ---------- - limit - The number of consecutive null values to backward fill. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": [4, None, 6], - ... "c": [None, None, 2], - ... } - ... ) - >>> df.select(pl.all().backward_fill()) - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 4 ┆ 2 │ - │ 2 ┆ 6 ┆ 2 │ - │ null ┆ 6 ┆ 2 │ - └──────┴─────┴─────┘ - >>> df.select(pl.all().backward_fill(limit=1)) - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ 1 ┆ 4 ┆ null │ - │ 2 ┆ 6 ┆ 2 │ - │ null ┆ 6 ┆ 2 │ - └──────┴─────┴──────┘ - - ''' - def reverse(self) -> Self: - ''' - Reverse the selection. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [1, 2, 3, 4, 5], - ... "fruits": ["banana", "banana", "apple", "apple", "banana"], - ... "B": [5, 4, 3, 2, 1], - ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], - ... } - ... ) - >>> df.select( - ... [ - ... pl.all(), - ... pl.all().reverse().name.suffix("_reverse"), - ... ] - ... 
) - shape: (5, 8) - ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ - │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ - ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ - │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ - │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ - │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ - │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ - │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ - └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ - - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Get standard deviation. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").std()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Get variance. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").var()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def max(self) -> Self: - ''' - Get maximum value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) - >>> df.select(pl.col("a").max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def min(self) -> Self: - ''' - Get minimum value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) - >>> df.select(pl.col("a").min()) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ -1.0 │ - └──────┘ - - ''' - def nan_max(self) -> Self: - ''' - Get maximum value, but propagate/poison encountered NaN values. - - This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0, float("nan")]}) - >>> df.select(pl.col("a").nan_max()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ NaN │ - └─────┘ - - ''' - def nan_min(self) -> Self: - ''' - Get minimum value, but propagate/poison encountered NaN values. - - This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0, float("nan")]}) - >>> df.select(pl.col("a").nan_min()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ NaN │ - └─────┘ - - ''' - def sum(self) -> Self: - ''' - Get sum value. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").sum()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 0 │ - └─────┘ - - ''' - def mean(self) -> Self: - ''' - Get mean value. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").mean()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def median(self) -> Self: - ''' - Get median value using linear interpolation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-1, 0, 1]}) - >>> df.select(pl.col("a").median()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def product(self) -> Self: - ''' - Compute the product of an expression. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").product()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def n_unique(self) -> Self: - ''' - Count unique values. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").n_unique()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def approx_n_unique(self) -> Self: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").approx_n_unique()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def null_count(self) -> Self: - ''' - Count null values. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [None, 1, None], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.select(pl.all().null_count()) - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 2 ┆ 0 │ - └─────┴─────┘ - - ''' - def arg_unique(self) -> Self: - ''' - Get index of first unique value. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10], - ... "b": [None, 4, 4], - ... } - ... ) - >>> df.select(pl.col("a").arg_unique()) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - │ 2 │ - └─────┘ - >>> df.select(pl.col("b").arg_unique()) - shape: (2, 1) - ┌─────┐ - │ b │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 0 │ - │ 1 │ - └─────┘ - - ''' - def unique(self) -> Self: - ''' - Get unique values of this expression. - - Parameters - ---------- - maintain_order - Maintain order of data. This requires more work. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - │ 1 │ - └─────┘ - >>> df.select(pl.col("a").unique(maintain_order=True)) - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - └─────┘ - - ''' - def first(self) -> Self: - ''' - Get the first value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").first()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - └─────┘ - - ''' - def last(self) -> Self: - ''' - Get the last value. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").last()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - └─────┘ - - ''' - def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: - ''' - Compute expressions over the given groups. - - This expression is similar to performing a group by aggregation and joining the - result back into the original DataFrame. 
- - The outcome is similar to how `window functions - `_ - work in PostgreSQL. - - Parameters - ---------- - expr - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_exprs - Additional columns to group by, specified as positional arguments. - mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} - - group_to_rows - If the aggregation results in multiple values, assign them back to their - position in the DataFrame. This can only be done if the group yields - the same elements before aggregation as after. - - join - Join the groups as \'List\' to the row positions. - warning: this can be memory intensive. - - explode - Don\'t do any mapping, but simply flatten the group. - This only makes sense if the input data is sorted. - - Examples - -------- - Pass the name of a column to compute the expression over that column. - - >>> df = pl.DataFrame( - ... { - ... "a": ["a", "a", "b", "b", "b"], - ... "b": [1, 2, 3, 5, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... ) - >>> df.with_columns( - ... pl.col("c").max().over("a").name.suffix("_max"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_max │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 5 │ - │ b ┆ 3 ┆ 3 ┆ 3 │ - │ b ┆ 5 ┆ 2 ┆ 3 │ - │ b ┆ 3 ┆ 1 ┆ 3 │ - └─────┴─────┴─────┴───────┘ - - Expression input is supported. - - >>> df.with_columns( - ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_max │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 4 │ - │ b ┆ 5 ┆ 2 ┆ 2 │ - │ b ┆ 3 ┆ 1 ┆ 4 │ - └─────┴─────┴─────┴───────┘ - - Group by multiple columns by passing a list of column names or expressions. - - >>> df.with_columns( - ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_min │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 1 │ - │ b ┆ 5 ┆ 2 ┆ 2 │ - │ b ┆ 3 ┆ 1 ┆ 1 │ - └─────┴─────┴─────┴───────┘ - - Or use positional arguments to group by multiple columns in the same way. - - >>> df.with_columns( - ... pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_min │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 1 │ - │ b ┆ 5 ┆ 2 ┆ 1 │ - │ b ┆ 3 ┆ 1 ┆ 1 │ - └─────┴─────┴─────┴───────┘ - - ''' - def rolling(self, index_column: str) -> Self: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - If you have a time series ``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... - * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... 
- * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order. - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> df.with_columns( - ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), - ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), - ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), - ... ) - shape: (6, 5) - ┌─────────────────────┬─────┬───────┬───────┬───────┐ - │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴─────┴───────┴───────┴───────┘ - - ''' - def is_unique(self) -> Self: - ''' - Get mask of unique values. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").is_unique()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ false │ - │ true │ - └───────┘ - - ''' - def is_first_distinct(self) -> Self: - ''' - Return a boolean mask indicating the first occurrence of each distinct value. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) - >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) - shape: (5, 2) - ┌─────┬───────┐ - │ a ┆ first │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪═══════╡ - │ 1 ┆ true │ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 2 ┆ false │ - └─────┴───────┘ - - ''' - def is_last_distinct(self) -> Self: - ''' - Return a boolean mask indicating the last occurrence of each distinct value. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) - >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) - shape: (5, 2) - ┌─────┬───────┐ - │ a ┆ last │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪═══════╡ - │ 1 ┆ false │ - │ 1 ┆ true │ - │ 2 ┆ false │ - │ 3 ┆ true │ - │ 2 ┆ true │ - └─────┴───────┘ - - ''' - def is_duplicated(self) -> Self: - ''' - Return a boolean mask indicating duplicated values. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 1, 2]}) - >>> df.select(pl.col("a").is_duplicated()) - shape: (3, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ true │ - │ true │ - │ false │ - └───────┘ - - ''' - def peak_max(self) -> Self: - ''' - Get a boolean mask of the local maximum peaks. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) - >>> df.select(pl.col("a").peak_max()) - shape: (5, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ false │ - │ false │ - │ false │ - │ true │ - └───────┘ - - ''' - def peak_min(self) -> Self: - ''' - Get a boolean mask of the local minimum peaks. - - Examples - -------- - >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) - >>> df.select(pl.col("a").peak_min()) - shape: (5, 1) - ┌───────┐ - │ a │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ false │ - │ true │ - │ false │ - └───────┘ - - ''' - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Get quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) - >>> df.select(pl.col("a").quantile(0.3)) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 2.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.5 │ - └─────┘ - >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.5 │ - └─────┘ - - ''' - def cut(self, breaks: Sequence[float]) -> Self: - ''' - Bin continuous values into discrete categories. - - Parameters - ---------- - breaks - List of unique cut points. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - left_closed - Set the intervals to be left-closed instead of right-closed. 
- include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - - Returns - ------- - Expr - Expression of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise an expression of data type :class:`Struct`. - - See Also - -------- - qcut - - Examples - -------- - Divide a column into three categories. - - >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) - >>> df.with_columns( - ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") - ... ) - shape: (5, 2) - ┌─────┬─────┐ - │ foo ┆ cut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪═════╡ - │ -2 ┆ a │ - │ -1 ┆ a │ - │ 0 ┆ b │ - │ 1 ┆ b │ - │ 2 ┆ c │ - └─────┴─────┘ - - Add both the category and the breakpoint. - - >>> df.with_columns( - ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") - ... ).unnest("cut") - shape: (5, 3) - ┌─────┬──────┬────────────┐ - │ foo ┆ brk ┆ foo_bin │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪══════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴──────┴────────────┘ - - ''' - def qcut(self, quantiles: Sequence[float] | int) -> Self: - ''' - Bin continuous values into discrete categories based on their quantiles. - - Parameters - ---------- - quantiles - Either a list of quantile probabilities between 0 and 1 or a positive - integer determining the number of bins with uniform probability. - labels - Names of the categories. The number of labels must be equal to the number - of categories. - left_closed - Set the intervals to be left-closed instead of right-closed. - allow_duplicates - If set to `True`, duplicates in the resulting quantiles are dropped, - rather than raising a `DuplicateError`. This can happen even with unique - probabilities, depending on the data. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - - Returns - ------- - Expr - Expression of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise an expression of data type :class:`Struct`. - - See Also - -------- - cut - - Examples - -------- - Divide a column into three categories according to pre-defined quantile - probabilities. - - >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) - >>> df.with_columns( - ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ foo ┆ qcut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪══════╡ - │ -2 ┆ a │ - │ -1 ┆ a │ - │ 0 ┆ b │ - │ 1 ┆ b │ - │ 2 ┆ c │ - └─────┴──────┘ - - Divide a column into two categories using uniform quantile probabilities. - - >>> df.with_columns( - ... pl.col("foo") - ... .qcut(2, labels=["low", "high"], left_closed=True) - ... .alias("qcut") - ... ) - shape: (5, 2) - ┌─────┬──────┐ - │ foo ┆ qcut │ - │ --- ┆ --- │ - │ i64 ┆ cat │ - ╞═════╪══════╡ - │ -2 ┆ low │ - │ -1 ┆ low │ - │ 0 ┆ high │ - │ 1 ┆ high │ - │ 2 ┆ high │ - └─────┴──────┘ - - Add both the category and the breakpoint. - - >>> df.with_columns( - ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") - ... 
).unnest("qcut") - shape: (5, 3) - ┌─────┬──────┬────────────┐ - │ foo ┆ brk ┆ foo_bin │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪══════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴──────┴────────────┘ - - ''' - def rle(self) -> Self: - ''' - Get the lengths of runs of identical values. - - Returns - ------- - Expr - Expression of data type :class:`Struct` with Fields "lengths" and "values". - - Examples - -------- - >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) - >>> df.select(pl.col("s").rle()).unnest("s") - shape: (6, 2) - ┌─────────┬────────┐ - │ lengths ┆ values │ - │ --- ┆ --- │ - │ i32 ┆ i64 │ - ╞═════════╪════════╡ - │ 2 ┆ 1 │ - │ 1 ┆ 2 │ - │ 1 ┆ 1 │ - │ 1 ┆ null │ - │ 1 ┆ 1 │ - │ 2 ┆ 3 │ - └─────────┴────────┘ - ''' - def rle_id(self) -> Self: - ''' - Map values to run IDs. - - Similar to RLE, but it maps each value to an ID corresponding to the run into - which it falls. This is especially useful when you want to define groups by - runs of identical values rather than the values themselves. - - - Examples - -------- - >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) - >>> # It works on structs of multiple values too! - >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) - shape: (5, 4) - ┌─────┬──────┬─────┬──────┐ - │ a ┆ b ┆ a_r ┆ ab_r │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ u32 ┆ u32 │ - ╞═════╪══════╪═════╪══════╡ - │ 1 ┆ x ┆ 0 ┆ 0 │ - │ 2 ┆ x ┆ 1 ┆ 1 │ - │ 1 ┆ null ┆ 2 ┆ 2 │ - │ 1 ┆ y ┆ 2 ┆ 3 │ - │ 1 ┆ y ┆ 2 ┆ 3 │ - └─────┴──────┴─────┴──────┘ - ''' - def filter(self, predicate: Expr) -> Self: - ''' - Filter a single column. - - The original order of the remaining elements is preserved. - - Mostly useful in an aggregation context. If you want to filter on a DataFrame - level, use `LazyFrame.filter`. - - Parameters - ---------- - predicate - Boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group_col": ["g1", "g1", "g2"], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.group_by("group_col").agg( - ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), - ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), - ... ).sort("group_col") - shape: (2, 3) - ┌───────────┬─────┬─────┐ - │ group_col ┆ lt ┆ gte │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═══════════╪═════╪═════╡ - │ g1 ┆ 1 ┆ 2 │ - │ g2 ┆ 0 ┆ 3 │ - └───────────┴─────┴─────┘ - - ''' - def where(self, predicate: Expr) -> Self: - ''' - Filter a single column. - - Alias for :func:`filter`. - - Parameters - ---------- - predicate - Boolean expression. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group_col": ["g1", "g1", "g2"], - ... "b": [1, 2, 3], - ... } - ... ) - >>> df.group_by("group_col").agg( - ... [ - ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), - ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), - ... ] - ... ).sort("group_col") - shape: (2, 3) - ┌───────────┬─────┬─────┐ - │ group_col ┆ lt ┆ gte │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═══════════╪═════╪═════╡ - │ g1 ┆ 1 ┆ 2 │ - │ g2 ┆ 0 ┆ 3 │ - └───────────┴─────┴─────┘ - - ''' - def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Apply a custom python function to a whole Series or sequence of Series. - - The output of this custom function must be a Series. 
If you want to apply a - custom function elementwise over single values, see :func:`map_elements`. - A reasonable use case for `map` functions is transforming the values - represented by an expression using a third-party library. - - Read more in `the book - `_. - - Parameters - ---------- - function - Lambda/function to apply. - return_dtype - Dtype of the output Series. - agg_list - Aggregate list. - - Notes - ----- - If you are looking to map a function over a window function or group_by context, - refer to func:`map_elements` instead. - - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - See Also - -------- - map_elements - replace - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "sine": [0.0, 1.0, 0.0, -1.0], - ... "cosine": [1.0, 0.0, -1.0, 0.0], - ... } - ... ) - >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) - shape: (1, 2) - ┌──────┬────────┐ - │ sine ┆ cosine │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪════════╡ - │ 1 ┆ 0 │ - └──────┴────────┘ - - ''' - def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Map a custom/user-defined function (UDF) to each element of a column. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - The UDF is applied to each element of a column. Note that, in a GroupBy - context, the column will have been pre-aggregated and so each element - will itself be a Series. Therefore, depending on the context, - requirements for `function` differ: - - * Selection - Expects `function` to be of type `Callable[[Any], Any]`. - Applies a Python function to each individual value in the column. - * GroupBy - Expects `function` to be of type `Callable[[Series], Any]`. - For each group, applies a Python function to the slice of the column - corresponding to that group. - - Parameters - ---------- - function - Lambda/function to map. - return_dtype - Dtype of the output Series. - If not set, the dtype will be `pl.Unknown`. - skip_nulls - Don\'t map the function over values that contain nulls (this is faster). - pass_name - Pass the Series name to the custom function (this is more expensive). - strategy : {\'thread_local\', \'threading\'} - This functionality is considered experimental and may be removed/changed. - - - \'thread_local\': run the python function on a single thread. - - \'threading\': run the python function on separate threads. Use with - care as this can slow performance. This might only speed up - your code if the amount of work per element is significant - and the python function releases the GIL (e.g. via calling - a c function) - - Notes - ----- - * Using `map_elements` is strongly discouraged as you will be effectively - running python "for" loops, which will be very slow. Wherever possible you - should prefer the native expression API to achieve the best performance. - - * If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. - - * Window function application using `over` is considered a GroupBy context - here, so `map_elements` can be used to map functions over window groups. 
- - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["a", "b", "c", "c"], - ... } - ... ) - - The function is applied to each element of column `\'a\'`: - - >>> df.with_columns( # doctest: +SKIP - ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────────┐ - │ a ┆ b ┆ a_times_2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 │ - ╞═════╪═════╪═══════════╡ - │ 1 ┆ a ┆ 2 │ - │ 2 ┆ b ┆ 4 │ - │ 3 ┆ c ┆ 6 │ - │ 1 ┆ c ┆ 2 │ - └─────┴─────┴───────────┘ - - Tip: it is better to implement this with an expression: - - >>> df.with_columns( - ... (pl.col("a") * 2).alias("a_times_2"), - ... ) # doctest: +IGNORE_RESULT - - In a GroupBy context, each element of the column is itself a Series: - - >>> ( - ... df.lazy().group_by("b").agg(pl.col("a")).collect() - ... ) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬───────────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [1] │ - │ b ┆ [2] │ - │ c ┆ [3, 1] │ - └─────┴───────────┘ - - Therefore, from the user\'s point-of-view, the function is applied per-group: - - >>> ( - ... df.lazy() - ... .group_by("b") - ... .agg(pl.col("a").map_elements(lambda x: x.sum())) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ b ┆ 2 │ - │ c ┆ 4 │ - └─────┴─────┘ - - Tip: again, it is better to implement this with an expression: - - >>> ( - ... df.lazy() - ... .group_by("b", maintain_order=True) - ... .agg(pl.col("a").sum()) - ... .collect() - ... ) # doctest: +IGNORE_RESULT - - Window function application using `over` will behave as a GroupBy - context, with your function receiving individual window groups: - - >>> df = pl.DataFrame( - ... { - ... "key": ["x", "x", "y", "x", "y", "z"], - ... "val": [1, 1, 1, 1, 1, 1], - ... } - ... ) - >>> df.with_columns( - ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), - ... ).sort("key") - shape: (6, 3) - ┌─────┬─────┬────────┐ - │ key ┆ val ┆ scaled │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪════════╡ - │ x ┆ 1 ┆ 3 │ - │ x ┆ 1 ┆ 3 │ - │ x ┆ 1 ┆ 3 │ - │ y ┆ 1 ┆ 2 │ - │ y ┆ 1 ┆ 2 │ - │ z ┆ 1 ┆ 1 │ - └─────┴─────┴────────┘ - - Note that this function would *also* be better-implemented natively: - - >>> df.with_columns( - ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), - ... ).sort( - ... "key" - ... ) # doctest: +IGNORE_RESULT - - ''' - def flatten(self) -> Self: - ''' - Flatten a list or string column. - - Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "b", "b"], - ... "values": [[1, 2], [2, 3], [4]], - ... } - ... ) - >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP - shape: (2, 2) - ┌───────┬───────────┐ - │ group ┆ values │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═══════╪═══════════╡ - │ a ┆ [1, 2] │ - │ b ┆ [2, 3, 4] │ - └───────┴───────────┘ - - ''' - def explode(self) -> Self: - ''' - Explode a list expression. - - This means that every item is expanded to a new row. - - Returns - ------- - Expr - Expression with the data type of the list elements. - - See Also - -------- - Expr.list.explode : Explode a list column. 
- Expr.str.explode : Explode a string column. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "group": ["a", "b"], - ... "values": [ - ... [1, 2], - ... [3, 4], - ... ], - ... } - ... ) - >>> df.select(pl.col("values").explode()) - shape: (4, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 1 │ - │ 2 │ - │ 3 │ - │ 4 │ - └────────┘ - - ''' - def implode(self) -> Self: - ''' - Aggregate values into a list. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [4, 5, 6], - ... } - ... ) - >>> df.select(pl.all().implode()) - shape: (1, 2) - ┌───────────┬───────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ list[i64] ┆ list[i64] │ - ╞═══════════╪═══════════╡ - │ [1, 2, 3] ┆ [4, 5, 6] │ - └───────────┴───────────┘ - - ''' - def gather_every(self, n: int) -> Self: - ''' - Take every nth value in the Series and return as a new Series. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) - >>> df.select(pl.col("foo").gather_every(3)) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 4 │ - │ 7 │ - └─────┘ - - ''' - def head(self, n: int | Expr = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.head(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def tail(self, n: int | Expr = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.tail(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 5 │ - │ 6 │ - │ 7 │ - └─────┘ - - ''' - def limit(self, n: int | Expr = ...) -> Self: - ''' - Get the first `n` rows (alias for :func:`Expr.head`). - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.limit(3) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def and_(self, *others: Any) -> Self: - ''' - Method equivalent of bitwise "and" operator `expr & other & ...`. - - Parameters - ---------- - *others - One or more integer or boolean expressions to evaluate/combine. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5, 6, 7, 4, 8], - ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], - ... "z": [-9, 2, -1, 4, 8], - ... } - ... ) - >>> df.select( - ... (pl.col("x") >= pl.col("z")) - ... .and_( - ... pl.col("y") >= pl.col("z"), - ... pl.col("y") == pl.col("y"), - ... pl.col("z") <= pl.col("x"), - ... pl.col("y") != pl.col("x"), - ... ) - ... .alias("all") - ... ) - shape: (5, 1) - ┌───────┐ - │ all │ - │ --- │ - │ bool │ - ╞═══════╡ - │ true │ - │ true │ - │ true │ - │ false │ - │ false │ - └───────┘ - - ''' - def or_(self, *others: Any) -> Self: - ''' - Method equivalent of bitwise "or" operator `expr | other | ...`. - - Parameters - ---------- - *others - One or more integer or boolean expressions to evaluate/combine. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5, 6, 7, 4, 8], - ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], - ... "z": [-9, 2, -1, 4, 8], - ... } - ... ) - >>> df.select( - ... (pl.col("x") == pl.col("y")) - ... .or_( - ... pl.col("x") == pl.col("y"), - ... 
pl.col("y") == pl.col("z"), - ... pl.col("y").cast(int) == pl.col("z"), - ... ) - ... .alias("any") - ... ) - shape: (5, 1) - ┌───────┐ - │ any │ - │ --- │ - │ bool │ - ╞═══════╡ - │ false │ - │ true │ - │ false │ - │ true │ - │ false │ - └───────┘ - - ''' - def eq(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr == other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0], - ... "y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").eq(pl.col("y")).alias("x == y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x == y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 1.0 ┆ 2.0 ┆ false │ - │ 2.0 ┆ 2.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 4.0 ┆ 4.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def eq_missing(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr == other` where `None == None`. - - This differs from default `eq` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], - ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").eq(pl.col("y")).alias("x eq y"), - ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), - ... ) - shape: (6, 4) - ┌──────┬──────┬────────┬────────────────┐ - │ x ┆ y ┆ x eq y ┆ x eq_missing y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪══════╪════════╪════════════════╡ - │ 1.0 ┆ 2.0 ┆ false ┆ false │ - │ 2.0 ┆ 2.0 ┆ true ┆ true │ - │ NaN ┆ NaN ┆ false ┆ false │ - │ 4.0 ┆ 4.0 ┆ true ┆ true │ - │ null ┆ 5.0 ┆ null ┆ false │ - │ null ┆ null ┆ null ┆ true │ - └──────┴──────┴────────┴────────────────┘ - - ''' - def ge(self, other: Any) -> Self: - ''' - Method equivalent of "greater than or equal" operator `expr >= other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 2.0], - ... "y": [5.0, 3.0, float("nan"), 1.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ge(pl.col("y")).alias("x >= y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x >= y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 5.0 ┆ 5.0 ┆ true │ - │ 4.0 ┆ 3.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 2.0 ┆ 1.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def gt(self, other: Any) -> Self: - ''' - Method equivalent of "greater than" operator `expr > other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 2.0], - ... "y": [5.0, 3.0, float("nan"), 1.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").gt(pl.col("y")).alias("x > y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────┐ - │ x ┆ y ┆ x > y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪═══════╡ - │ 5.0 ┆ 5.0 ┆ false │ - │ 4.0 ┆ 3.0 ┆ true │ - │ NaN ┆ NaN ┆ false │ - │ 2.0 ┆ 1.0 ┆ true │ - └─────┴─────┴───────┘ - - ''' - def le(self, other: Any) -> Self: - ''' - Method equivalent of "less than or equal" operator `expr <= other`. 
- - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [5.0, 4.0, float("nan"), 0.5], - ... "y": [5.0, 3.5, float("nan"), 2.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").le(pl.col("y")).alias("x <= y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x <= y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 5.0 ┆ 5.0 ┆ true │ - │ 4.0 ┆ 3.5 ┆ false │ - │ NaN ┆ NaN ┆ false │ - │ 0.5 ┆ 2.0 ┆ true │ - └─────┴─────┴────────┘ - - ''' - def lt(self, other: Any) -> Self: - ''' - Method equivalent of "less than" operator `expr < other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 3.0], - ... "y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").lt(pl.col("y")).alias("x < y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬───────┐ - │ x ┆ y ┆ x < y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪═══════╡ - │ 1.0 ┆ 2.0 ┆ true │ - │ 2.0 ┆ 2.0 ┆ false │ - │ NaN ┆ NaN ┆ false │ - │ 3.0 ┆ 4.0 ┆ true │ - └─────┴─────┴───────┘ - - ''' - def ne(self, other: Any) -> Self: - ''' - Method equivalent of inequality operator `expr != other`. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0], - ... "y": [2.0, 2.0, float("nan"), 4.0], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ne(pl.col("y")).alias("x != y"), - ... ) - shape: (4, 3) - ┌─────┬─────┬────────┐ - │ x ┆ y ┆ x != y │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪═════╪════════╡ - │ 1.0 ┆ 2.0 ┆ true │ - │ 2.0 ┆ 2.0 ┆ false │ - │ NaN ┆ NaN ┆ true │ - │ 4.0 ┆ 4.0 ┆ false │ - └─────┴─────┴────────┘ - - ''' - def ne_missing(self, other: Any) -> Self: - ''' - Method equivalent of equality operator `expr != other` where `None == None`. - - This differs from default `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - Examples - -------- - >>> df = pl.DataFrame( - ... data={ - ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], - ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], - ... } - ... ) - >>> df.with_columns( - ... pl.col("x").ne(pl.col("y")).alias("x ne y"), - ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), - ... ) - shape: (6, 4) - ┌──────┬──────┬────────┬────────────────┐ - │ x ┆ y ┆ x ne y ┆ x ne_missing y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪══════╪════════╪════════════════╡ - │ 1.0 ┆ 2.0 ┆ true ┆ true │ - │ 2.0 ┆ 2.0 ┆ false ┆ false │ - │ NaN ┆ NaN ┆ true ┆ true │ - │ 4.0 ┆ 4.0 ┆ false ┆ false │ - │ null ┆ 5.0 ┆ null ┆ true │ - │ null ┆ null ┆ null ┆ false │ - └──────┴──────┴────────┴────────────────┘ - - ''' - def add(self, other: Any) -> Self: - ''' - Method equivalent of addition operator `expr + other`. - - Parameters - ---------- - other - numeric or string value; accepts expression input. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) - >>> df.with_columns( - ... pl.col("x").add(2).alias("x+int"), - ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), - ... 
) - shape: (5, 3) - ┌─────┬───────┬────────┐ - │ x ┆ x+int ┆ x+expr │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═══════╪════════╡ - │ 1 ┆ 3 ┆ 2 │ - │ 2 ┆ 4 ┆ 4 │ - │ 3 ┆ 5 ┆ 9 │ - │ 4 ┆ 6 ┆ 28 │ - │ 5 ┆ 7 ┆ 125 │ - └─────┴───────┴────────┘ - - >>> df = pl.DataFrame( - ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} - ... ) - >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) - shape: (3, 4) - ┌─────┬─────┬─────┬─────┐ - │ x ┆ y ┆ z ┆ xyz │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ str ┆ str │ - ╞═════╪═════╪═════╪═════╡ - │ a ┆ b ┆ c ┆ abc │ - │ d ┆ e ┆ f ┆ def │ - │ g ┆ h ┆ i ┆ ghi │ - └─────┴─────┴─────┴─────┘ - - ''' - def floordiv(self, other: Any) -> Self: - ''' - Method equivalent of integer division operator `expr // other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - See Also - -------- - truediv - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) - >>> df.with_columns( - ... pl.col("x").truediv(2).alias("x/2"), - ... pl.col("x").floordiv(2).alias("x//2"), - ... ) - shape: (5, 3) - ┌─────┬─────┬──────┐ - │ x ┆ x/2 ┆ x//2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ i64 │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 0.5 ┆ 0 │ - │ 2 ┆ 1.0 ┆ 1 │ - │ 3 ┆ 1.5 ┆ 1 │ - │ 4 ┆ 2.0 ┆ 2 │ - │ 5 ┆ 2.5 ┆ 2 │ - └─────┴─────┴──────┘ - - ''' - def mod(self, other: Any) -> Self: - ''' - Method equivalent of modulus operator `expr % other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) - >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) - shape: (5, 2) - ┌─────┬─────┐ - │ x ┆ x%2 │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 0 ┆ 0 │ - │ 1 ┆ 1 │ - │ 2 ┆ 0 │ - │ 3 ┆ 1 │ - │ 4 ┆ 0 │ - └─────┴─────┘ - - ''' - def mul(self, other: Any) -> Self: - ''' - Method equivalent of multiplication operator `expr * other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) - >>> df.with_columns( - ... pl.col("x").mul(2).alias("x*2"), - ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), - ... ) - shape: (5, 3) - ┌─────┬─────┬───────────┐ - │ x ┆ x*2 ┆ x * xlog2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ f64 │ - ╞═════╪═════╪═══════════╡ - │ 1 ┆ 2 ┆ 0.0 │ - │ 2 ┆ 4 ┆ 2.0 │ - │ 4 ┆ 8 ┆ 8.0 │ - │ 8 ┆ 16 ┆ 24.0 │ - │ 16 ┆ 32 ┆ 64.0 │ - └─────┴─────┴───────────┘ - - ''' - def sub(self, other: Any) -> Self: - ''' - Method equivalent of subtraction operator `expr - other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) - >>> df.with_columns( - ... pl.col("x").sub(2).alias("x-2"), - ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), - ... ) - shape: (5, 3) - ┌─────┬─────┬────────┐ - │ x ┆ x-2 ┆ x-expr │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪════════╡ - │ 0 ┆ -2 ┆ 0 │ - │ 1 ┆ -1 ┆ 0 │ - │ 2 ┆ 0 ┆ -1 │ - │ 3 ┆ 1 ┆ -3 │ - │ 4 ┆ 2 ┆ -6 │ - └─────┴─────┴────────┘ - - ''' - def truediv(self, other: Any) -> Self: - ''' - Method equivalent of float division operator `expr / other`. - - Parameters - ---------- - other - Numeric literal or expression value. - - Notes - ----- - Zero-division behaviour follows IEEE-754: - - 0/0: Invalid operation - mathematically undefined, returns NaN. - n/0: On finite operands gives an exact infinite result, eg: ±infinity. 
- - See Also - -------- - floordiv - - Examples - -------- - >>> df = pl.DataFrame( - ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} - ... ) - >>> df.with_columns( - ... pl.col("x").truediv(2).alias("x/2"), - ... pl.col("x").truediv(pl.col("y")).alias("x/y"), - ... ) - shape: (5, 4) - ┌─────┬──────┬──────┬───────┐ - │ x ┆ y ┆ x/2 ┆ x/y │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ f64 ┆ f64 │ - ╞═════╪══════╪══════╪═══════╡ - │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ - │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ - │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ - │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ - │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ - └─────┴──────┴──────┴───────┘ - - ''' - def pow(self, exponent: int | float | None | Series | Expr) -> Self: - ''' - Method equivalent of exponentiation operator `expr ** exponent`. - - Parameters - ---------- - exponent - Numeric literal or expression exponent value. - - Examples - -------- - >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) - >>> df.with_columns( - ... pl.col("x").pow(3).alias("cube"), - ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), - ... ) - shape: (4, 3) - ┌─────┬───────┬────────────┐ - │ x ┆ cube ┆ x ** xlog2 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ f64 │ - ╞═════╪═══════╪════════════╡ - │ 1 ┆ 1.0 ┆ 1.0 │ - │ 2 ┆ 8.0 ┆ 2.0 │ - │ 4 ┆ 64.0 ┆ 16.0 │ - │ 8 ┆ 512.0 ┆ 512.0 │ - └─────┴───────┴────────────┘ - - ''' - def xor(self, other: Any) -> Self: - ''' - Method equivalent of bitwise exclusive-or operator `expr ^ other`. - - Parameters - ---------- - other - Integer or boolean value; accepts expression input. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"x": [True, False, True, False], "y": [True, True, False, False]} - ... ) - >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) - shape: (4, 3) - ┌───────┬───────┬───────┐ - │ x ┆ y ┆ x ^ y │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞═══════╪═══════╪═══════╡ - │ true ┆ true ┆ false │ - │ false ┆ true ┆ true │ - │ true ┆ false ┆ true │ - │ false ┆ false ┆ false │ - └───────┴───────┴───────┘ - - >>> def binary_string(n: int) -> str: - ... return bin(n)[2:].zfill(8) - >>> - >>> df = pl.DataFrame( - ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, - ... schema={"x": pl.UInt8, "y": pl.UInt8}, - ... ) - >>> df.with_columns( - ... pl.col("x").map_elements(binary_string).alias("bin_x"), - ... pl.col("y").map_elements(binary_string).alias("bin_y"), - ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), - ... pl.col("x") - ... .xor(pl.col("y")) - ... .map_elements(binary_string) - ... .alias("bin_xor_xy"), - ... ) - shape: (4, 6) - ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ - │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ - ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ - │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ - │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ - │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ - │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ - └─────┴─────┴──────────┴──────────┴────────┴────────────┘ - - ''' - def is_in(self, other: Expr | Collection[Any] | Series) -> Self: - ''' - Check if elements of this expression are present in the other Series. - - Parameters - ---------- - other - Series or sequence of primitive type. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} - ... 
) - >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) - shape: (3, 3) - ┌───────────┬──────────────────┬──────────┐ - │ sets ┆ optional_members ┆ contains │ - │ --- ┆ --- ┆ --- │ - │ list[i64] ┆ i64 ┆ bool │ - ╞═══════════╪══════════════════╪══════════╡ - │ [1, 2, 3] ┆ 1 ┆ true │ - │ [1, 2] ┆ 2 ┆ true │ - │ [9, 10] ┆ 3 ┆ false │ - └───────────┴──────────────────┴──────────┘ - - ''' - def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: - ''' - Repeat the elements in this Series as specified in the given expression. - - The repeated elements are expanded into a `List`. - - Parameters - ---------- - by - Numeric column that determines how often the values will be repeated. - The column will be coerced to UInt32. Give this dtype to make the coercion a - no-op. - - Returns - ------- - Expr - Expression of data type :class:`List`, where the inner data type is equal - to the original data type. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": ["x", "y", "z"], - ... "n": [1, 2, 3], - ... } - ... ) - >>> df.select(pl.col("a").repeat_by("n")) - shape: (3, 1) - ┌─────────────────┐ - │ a │ - │ --- │ - │ list[str] │ - ╞═════════════════╡ - │ ["x"] │ - │ ["y", "y"] │ - │ ["z", "z", "z"] │ - └─────────────────┘ - - ''' - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: - ''' - Check if this expression is between the given start and end values. - - Parameters - ---------- - lower_bound - Lower bound value. Accepts expression input. Strings are parsed as column - names, other non-expression inputs are parsed as literals. - upper_bound - Upper bound value. Accepts expression input. Strings are parsed as column - names, other non-expression inputs are parsed as literals. - closed : {\'both\', \'left\', \'right\', \'none\'} - Define which sides of the interval are closed (inclusive). - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - Examples - -------- - >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) - >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) - shape: (5, 2) - ┌─────┬────────────┐ - │ num ┆ is_between │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪════════════╡ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 4 ┆ true │ - │ 5 ┆ false │ - └─────┴────────────┘ - - Use the `closed` argument to include or exclude the values at the bounds: - - >>> df.with_columns( - ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") - ... ) - shape: (5, 2) - ┌─────┬────────────┐ - │ num ┆ is_between │ - │ --- ┆ --- │ - │ i64 ┆ bool │ - ╞═════╪════════════╡ - │ 1 ┆ false │ - │ 2 ┆ true │ - │ 3 ┆ true │ - │ 4 ┆ false │ - │ 5 ┆ false │ - └─────┴────────────┘ - - You can also use strings as well as numeric/temporal values (note: ensure that - string literals are wrapped with `lit` so as not to conflate them with - column names): - - >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) - >>> df.with_columns( - ... pl.col("a") - ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") - ... .alias("is_between") - ... ) - shape: (5, 2) - ┌─────┬────────────┐ - │ a ┆ is_between │ - │ --- ┆ --- │ - │ str ┆ bool │ - ╞═════╪════════════╡ - │ a ┆ true │ - │ b ┆ true │ - │ c ┆ true │ - │ d ┆ false │ - │ e ┆ false │ - └─────┴────────────┘ - - ''' - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: - ''' - Hash the elements in the selection. - - The hash value is of type `UInt64`. 
- - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. - - Notes - ----- - This implementation of :func:`rows` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [1, 2, None], - ... "b": ["x", None, "z"], - ... } - ... ) - >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌──────────────────────┬──────────────────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u64 ┆ u64 │ - ╞══════════════════════╪══════════════════════╡ - │ 9774092659964970114 ┆ 13614470193936745724 │ - │ 1101441246220388612 ┆ 11638928888656214026 │ - │ 11638928888656214026 ┆ 13382926553367784577 │ - └──────────────────────┴──────────────────────┘ - - ''' - def reinterpret(self) -> Self: - ''' - Reinterpret the underlying bits as a signed/unsigned integer. - - This operation is only allowed for 64bit integers. For lower bits integers, - you can safely use that cast operation. - - Parameters - ---------- - signed - If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. - - Examples - -------- - >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) - >>> df = pl.DataFrame([s]) - >>> df.select( - ... [ - ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), - ... pl.col("a").alias("original"), - ... ] - ... ) - shape: (3, 2) - ┌───────────────┬──────────┐ - │ reinterpreted ┆ original │ - │ --- ┆ --- │ - │ i64 ┆ u64 │ - ╞═══════════════╪══════════╡ - │ 1 ┆ 1 │ - │ 1 ┆ 1 │ - │ 2 ┆ 2 │ - └───────────────┴──────────┘ - - ''' - def inspect(self, fmt: str = ...) -> Self: - ''' - Print the value that this expression evaluates to and pass on the value. - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 1, 2]}) - >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) - value is: shape: (3,) - Series: \'foo\' [i64] - [ - 1 - 2 - 4 - ] - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 4 │ - └─────┘ - - ''' - def interpolate(self, method: InterpolationMethod = ...) -> Self: - ''' - Fill null values using interpolation. - - Parameters - ---------- - method : {\'linear\', \'nearest\'} - Interpolation method. - - Examples - -------- - Fill null values using linear interpolation. - - >>> df = pl.DataFrame( - ... { - ... "a": [1, None, 3], - ... "b": [1.0, float("nan"), 3.0], - ... } - ... ) - >>> df.select(pl.all().interpolate()) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 1.0 ┆ 1.0 │ - │ 2.0 ┆ NaN │ - │ 3.0 ┆ 3.0 │ - └─────┴─────┘ - - Fill null values using nearest interpolation. - - >>> df.select(pl.all().interpolate("nearest")) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 1.0 │ - │ 3 ┆ NaN │ - │ 3 ┆ 3.0 │ - └─────┴─────┘ - - Regrid data to a new grid. - - >>> df_original_grid = pl.DataFrame( - ... { - ... "grid_points": [1, 3, 10], - ... "values": [2.0, 6.0, 20.0], - ... } - ... ) # Interpolate from this to the new grid - >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) - >>> df_new_grid.join( - ... df_original_grid, on="grid_points", how="left" - ... 
).with_columns(pl.col("values").interpolate()) - shape: (10, 2) - ┌─────────────┬────────┐ - │ grid_points ┆ values │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════════════╪════════╡ - │ 1 ┆ 2.0 │ - │ 2 ┆ 4.0 │ - │ 3 ┆ 6.0 │ - │ 4 ┆ 8.0 │ - │ … ┆ … │ - │ 7 ┆ 14.0 │ - │ 8 ┆ 16.0 │ - │ 9 ┆ 18.0 │ - │ 10 ┆ 20.0 │ - └─────────────┴────────┘ - - ''' - def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling min (moving min) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their min. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic - temporal size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min(window_size=2), - ... 
) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 4.0 │ - │ 6.0 ┆ 5.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.25 │ - │ 3.0 ┆ 0.5 │ - │ 4.0 ┆ 0.75 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ 1.25 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_min │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 4.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - >>> df_temporal.with_columns( - ... rolling_row_min=pl.col("row_nr").rolling_min( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_min │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling max (moving max) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their max. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. 
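# Editor's note - illustrative sketch, not part of the generated stub or the diff:
# the weighted rolling_min example above (weights=[0.25, 0.75]) follows from the
# description "multiplied elementwise with the weights ... aggregated to their min".
# A rough pure-Python rendering of that semantics; polars' actual kernel differs.
def weighted_rolling_min(values, weights):
    window_size = len(weights)
    out = []
    for i in range(len(values)):
        if i + 1 < window_size:
            out.append(None)  # not enough values yet -> null in the stub examples
        else:
            window = values[i - window_size + 1 : i + 1]
            out.append(min(w * v for w, v in zip(weights, window)))
    return out

print(weighted_rolling_min([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], [0.25, 0.75]))
# [None, 0.25, 0.5, 0.75, 1.0, 1.25]  -> matches the rolling_min weights example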
Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ 6.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.25 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ 3.75 │ - │ 6.0 ┆ 4.5 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_max │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 3.0 │ - │ 3.0 ┆ 4.0 │ - │ 4.0 ┆ 5.0 │ - │ 5.0 ┆ 6.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... 
).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling max with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_max=pl.col("row_nr").rolling_max( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_max │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling max with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_max=pl.col("row_nr").rolling_max( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_max │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling mean (moving mean) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their mean. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). 
Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴──────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.75 │ - │ 3.0 ┆ 2.75 │ - │ 4.0 ┆ 3.75 │ - │ 5.0 ┆ 4.75 │ - │ 6.0 ┆ 5.75 │ - └─────┴──────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬──────────────┐ - │ A ┆ rolling_mean │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ null │ - └─────┴──────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... 
).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling mean with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_mean=pl.col("row_nr").rolling_mean( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬──────────────────┐ - │ row_nr ┆ date ┆ rolling_row_mean │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪══════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ - └────────┴─────────────────────┴──────────────────┘ - - Compute the rolling mean with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_mean=pl.col("row_nr").rolling_mean( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬──────────────────┐ - │ row_nr ┆ date ┆ rolling_row_mean │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪══════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ - └────────┴─────────────────────┴──────────────────┘ - - ''' - def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Apply a rolling sum (moving sum) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their sum. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. 
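# Editor's note - illustrative sketch, not part of the generated stub or the diff:
# why the first row of every fixed-size rolling example above is null. As the
# min_periods parameter says, it defaults to window_size, so windows holding fewer
# values than that produce null. A rough pure-Python rendering of that rule:
def rolling_apply(values, window_size, fn, min_periods=None):
    min_periods = window_size if min_periods is None else min_periods
    out = []
    for i in range(len(values)):
        window = values[max(0, i - window_size + 1) : i + 1]
        out.append(fn(window) if len(window) >= min_periods else None)
    return out

def mean(xs):
    return sum(xs) / len(xs)

print(rolling_apply([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 2, mean))
# [None, 1.5, 2.5, 3.5, 4.5, 5.5]  -> matches the rolling_mean(window_size=2) example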
Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - of dtype `{Date, Datetime}` - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 3.0 │ - │ 3.0 ┆ 5.0 │ - │ 4.0 ┆ 7.0 │ - │ 5.0 ┆ 9.0 │ - │ 6.0 ┆ 11.0 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.75 │ - │ 3.0 ┆ 2.75 │ - │ 4.0 ┆ 3.75 │ - │ 5.0 ┆ 4.75 │ - │ 6.0 ┆ 5.75 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_sum │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 6.0 │ - │ 3.0 ┆ 9.0 │ - │ 4.0 ┆ 12.0 │ - │ 5.0 ┆ 15.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... 
).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling sum with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_sum=pl.col("row_nr").rolling_sum( - ... window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_sum │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling sum with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_sum=pl.col("row_nr").rolling_sum( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_sum │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling standard deviation. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` means - the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. 
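# Editor's note - illustrative sketch, not part of the generated stub or the diff:
# the temporal window bounds described above, [t_i - window_size, t_i) for
# closed="left" versus [t_i - window_size, t_i] for closed="both", checked against
# the hourly df_temporal examples (row 3 sums to 3 with "left" and 6 with "both").
from datetime import datetime, timedelta

times = [datetime(2001, 1, 1) + timedelta(hours=h) for h in range(25)]
window = timedelta(hours=2)

def rows_in_window(t_i, closed):
    lower = t_i - window
    if closed == "left":
        return [h for h, t in enumerate(times) if lower <= t < t_i]
    if closed == "both":
        return [h for h, t in enumerate(times) if lower <= t <= t_i]
    raise ValueError(closed)

print(rows_in_window(times[3], "left"), rows_in_window(times[3], "both"))
# [1, 2] [1, 2, 3]  -> row numbers whose sums match the rolling_sum tables above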
- weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.707107 │ - │ 3.0 ┆ 0.707107 │ - │ 4.0 ┆ 0.707107 │ - │ 5.0 ┆ 0.707107 │ - │ 6.0 ┆ 0.707107 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.433013 │ - │ 3.0 ┆ 0.433013 │ - │ 4.0 ┆ 0.433013 │ - │ 5.0 ┆ 0.433013 │ - │ 6.0 ┆ 0.433013 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_std │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 1.0 │ - │ 4.0 ┆ 1.0 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling std with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_std=pl.col("row_nr").rolling_std( - ... 
window_size="2h", by="date", closed="left" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_std │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling std with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_std=pl.col("row_nr").rolling_std( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_std │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling variance. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. 
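# Editor's note - illustrative sketch, not part of the generated stub or the diff:
# the `ddof` parameter documented for rolling_std/rolling_var means the divisor for
# a length-N window is N - ddof. With ddof = 1 (which reproduces the example values),
# a two-value window gives the 0.5 / 0.707107 numbers seen in the tables above:
def window_var(values, ddof=1):
    n = len(values)
    mean = sum(values) / n
    return sum((v - mean) ** 2 for v in values) / (n - ddof)

print(window_var([1.0, 2.0]))                    # 0.5       -> rolling_var example
print(round(window_var([1.0, 2.0]) ** 0.5, 6))   # 0.707107  -> rolling_std example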
- closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.5 │ - │ 3.0 ┆ 0.5 │ - │ 4.0 ┆ 0.5 │ - │ 5.0 ┆ 0.5 │ - │ 6.0 ┆ 0.5 │ - └─────┴─────────────┘ - - Specify weights to multiply the values in the window with: - - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 0.1875 │ - │ 3.0 ┆ 0.1875 │ - │ 4.0 ┆ 0.1875 │ - │ 5.0 ┆ 0.1875 │ - │ 6.0 ┆ 0.1875 │ - └─────┴─────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬─────────────┐ - │ A ┆ rolling_var │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.0 │ - │ 3.0 ┆ 1.0 │ - │ 4.0 ┆ 1.0 │ - │ 5.0 ┆ 1.0 │ - │ 6.0 ┆ null │ - └─────┴─────────────┘ - - Create a DataFrame with a datetime column and a row number column - - >>> from datetime import timedelta, datetime - >>> start = datetime(2001, 1, 1) - >>> stop = datetime(2001, 1, 2) - >>> df_temporal = pl.DataFrame( - ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() - >>> df_temporal - shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ - - Compute the rolling var with the default left closure of temporal windows - - >>> df_temporal.with_columns( - ... rolling_row_var=pl.col("row_nr").rolling_var( - ... window_size="2h", by="date", closed="left" - ... ) - ... 
) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_var │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ - └────────┴─────────────────────┴─────────────────┘ - - Compute the rolling var with the closure of windows on both sides - - >>> df_temporal.with_columns( - ... rolling_row_var=pl.col("row_nr").rolling_var( - ... window_size="2h", by="date", closed="both" - ... ) - ... ) - shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_var │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ - └────────┴─────────────────────┴─────────────────┘ - - ''' - def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling median. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` means - the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - window_size - The length of the window. Can be a fixed integer size, or a dynamic temporal - size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. - weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. 
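# Editor's note - illustrative sketch, not part of the generated stub or the diff:
# what "Set the labels at the center of the window" means for an odd window_size.
# Rough pure-Python rendering; incomplete centered windows stay null, matching the
# center=True examples in the rolling docstrings above:
def rolling_center(values, window_size, fn):
    half = window_size // 2
    out = []
    for i in range(len(values)):
        lo, hi = i - half, i + half + 1
        out.append(fn(values[lo:hi]) if lo >= 0 and hi <= len(values) else None)
    return out

print(rolling_center([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 3, min))
# [None, 1.0, 2.0, 3.0, 4.0, None]  -> matches the centered rolling_min example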
- closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median(window_size=2), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴────────────────┘ - - Specify weights for the values in each window: - - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median( - ... window_size=2, weights=[0.25, 0.75] - ... ), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 1.5 │ - │ 3.0 ┆ 2.5 │ - │ 4.0 ┆ 3.5 │ - │ 5.0 ┆ 4.5 │ - │ 6.0 ┆ 5.5 │ - └─────┴────────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), - ... ) - shape: (6, 2) - ┌─────┬────────────────┐ - │ A ┆ rolling_median │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ 2.0 │ - │ 3.0 ┆ 3.0 │ - │ 4.0 ┆ 4.0 │ - │ 5.0 ┆ 5.0 │ - │ 6.0 ┆ null │ - └─────┴────────────────┘ - - ''' - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a rolling quantile. - - If `by` has not been specified (the default), the window at a given row will - include the row itself, and the `window_size - 1` elements before it. - - If you pass a `by` column ``, then `closed="left"` - means the windows will be: - - - [t_0 - window_size, t_0) - - [t_1 - window_size, t_1) - - ... - - [t_n - window_size, t_n) - - With `closed="right"`, the left endpoint is not included and the right - endpoint is included. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - window_size - The length of the window. Can be a fixed integer size, or a dynamic - temporal size indicated by a timedelta or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - If a timedelta or the dynamic string language is used, the `by` - and `closed` arguments must also be set. 
- weights - An optional slice with the same length as the window that determines the - relative contribution of each value in a window to the output. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - by - If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must - set the column that will be used to determine the windows. This column must - be of dtype Datetime or Date. - - .. warning:: - If passed, the column must be sorted in ascending order. Otherwise, - results will not be correct. - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive); only - applicable if `by` has been set. - warn_if_unsorted - Warn if data is not known to be sorted by `by` column (if passed). - Experimental. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Notes - ----- - If you want to compute multiple aggregation statistics over the same dynamic - window, consider using `rolling` - this method can cache the window size - computation. - - Examples - -------- - >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, window_size=4 - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 2.0 │ - │ 5.0 ┆ 3.0 │ - │ 6.0 ┆ 4.0 │ - └─────┴──────────────────┘ - - Specify weights for the values in each window: - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 2.0 │ - │ 5.0 ┆ 3.0 │ - │ 6.0 ┆ 4.0 │ - └─────┴──────────────────┘ - - Specify weights and interpolation method - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.25, - ... window_size=4, - ... weights=[0.2, 0.4, 0.4, 0.2], - ... interpolation="linear", - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ null │ - │ 4.0 ┆ 1.625 │ - │ 5.0 ┆ 2.625 │ - │ 6.0 ┆ 3.625 │ - └─────┴──────────────────┘ - - Center the values in the window - - >>> df.with_columns( - ... rolling_quantile=pl.col("A").rolling_quantile( - ... quantile=0.2, window_size=5, center=True - ... ), - ... ) - shape: (6, 2) - ┌─────┬──────────────────┐ - │ A ┆ rolling_quantile │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════════════════╡ - │ 1.0 ┆ null │ - │ 2.0 ┆ null │ - │ 3.0 ┆ 2.0 │ - │ 4.0 ┆ 3.0 │ - │ 5.0 ┆ null │ - │ 6.0 ┆ null │ - └─────┴──────────────────┘ - - ''' - def rolling_skew(self, window_size: int) -> Self: - ''' - Compute a rolling skew. - - The window at a given row includes the row itself and the - `window_size - 1` elements before it. - - Parameters - ---------- - window_size - Integer size of the rolling window. - bias - If False, the calculations are corrected for statistical bias. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) - >>> df.select(pl.col("a").rolling_skew(3)) - shape: (4, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ null │ - │ null │ - │ 0.381802 │ - │ 0.47033 │ - └──────────┘ - - Note how the values match the following: - - >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() - (0.38180177416060584, 0.47033046033698594) - - ''' - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - ''' - Compute a custom rolling window function. - - .. warning:: - Computing custom functions is extremely slow. Use specialized rolling - functions such as :func:`Expr.rolling_sum` if at all possible. - - Parameters - ---------- - function - Custom aggregation function. - window_size - Size of the window. The window at a given row will include the row - itself and the `window_size - 1` elements before it. - weights - A list of weights with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window. - - Examples - -------- - >>> from numpy import nansum - >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) - >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) - shape: (5, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ null │ - │ null │ - │ 22.0 │ - │ 11.0 │ - │ 17.0 │ - └──────┘ - - ''' - def abs(self) -> Self: - ''' - Compute absolute values. - - Same as `abs(expr)`. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "A": [-1.0, 0.0, 1.0, 2.0], - ... } - ... ) - >>> df.select(pl.col("A").abs()) - shape: (4, 1) - ┌─────┐ - │ A │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - │ 0.0 │ - │ 1.0 │ - │ 2.0 │ - └─────┘ - - ''' - def rank(self, method: RankMethod = ...) -> Self: - ''' - Assign ranks to data, dealing with ties appropriately. - - Parameters - ---------- - method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} - The method used to assign ranks to tied elements. - The following methods are available (default is \'average\'): - - - \'average\' : The average of the ranks that would have been assigned to - all the tied values is assigned to each value. - - \'min\' : The minimum of the ranks that would have been assigned to all - the tied values is assigned to each value. (This is also referred to - as "competition" ranking.) - - \'max\' : The maximum of the ranks that would have been assigned to all - the tied values is assigned to each value. - - \'dense\' : Like \'min\', but the rank of the next highest element is - assigned the rank immediately after those assigned to the tied - elements. - - \'ordinal\' : All values are given a distinct rank, corresponding to - the order that the values occur in the Series. - - \'random\' : Like \'ordinal\', but the rank for ties is not dependent - on the order that the values occur in the Series. - descending - Rank in descending order. - seed - If `method="random"`, use this as seed. 
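# Editor's note - illustrative sketch, not part of the generated stub or the diff:
# the 'average' and 'ordinal' tie-handling rules described above, reproduced in
# plain Python for the docstring data [3, 6, 1, 1, 6]; the outputs match the rank()
# examples that follow.
def rank_average(values):
    order = sorted(range(len(values)), key=lambda i: values[i])
    ranks = [0.0] * len(values)
    i = 0
    while i < len(order):
        j = i
        while j < len(order) and values[order[j]] == values[order[i]]:
            j += 1
        tied_rank = (i + 1 + j) / 2  # mean of the ranks i+1 .. j shared by the tie
        for k in range(i, j):
            ranks[order[k]] = tied_rank
        i = j
    return ranks

def rank_ordinal(values):
    order = sorted(range(len(values)), key=lambda i: values[i])  # stable: ties keep occurrence order
    ranks = [0] * len(values)
    for r, idx in enumerate(order, start=1):
        ranks[idx] = r
    return ranks

print(rank_average([3, 6, 1, 1, 6]))  # [3.0, 4.5, 1.5, 1.5, 4.5]
print(rank_ordinal([3, 6, 1, 1, 6]))  # [3, 4, 1, 2, 5]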
- - Examples - -------- - The \'average\' method: - - >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) - >>> df.select(pl.col("a").rank()) - shape: (5, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 3.0 │ - │ 4.5 │ - │ 1.5 │ - │ 1.5 │ - │ 4.5 │ - └─────┘ - - The \'ordinal\' method: - - >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) - >>> df.select(pl.col("a").rank("ordinal")) - shape: (5, 1) - ┌─────┐ - │ a │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 3 │ - │ 4 │ - │ 1 │ - │ 2 │ - │ 5 │ - └─────┘ - - Use \'rank\' with \'over\' to rank within groups: - - >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) - >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) - shape: (5, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ rank │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ f64 │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ 1.0 │ - │ 1 ┆ 7 ┆ 2.0 │ - │ 2 ┆ 5 ┆ 1.0 │ - │ 2 ┆ 14 ┆ 3.0 │ - │ 2 ┆ 11 ┆ 2.0 │ - └─────┴─────┴──────┘ - - ''' - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: - ''' - Calculate the first discrete difference between shifted items. - - Parameters - ---------- - n - Number of slots to shift. - null_behavior : {\'ignore\', \'drop\'} - How to handle null values. - - Examples - -------- - >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) - >>> df.with_columns(change=pl.col("int").diff()) - shape: (5, 2) - ┌─────┬────────┐ - │ int ┆ change │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪════════╡ - │ 20 ┆ null │ - │ 10 ┆ -10 │ - │ 30 ┆ 20 │ - │ 25 ┆ -5 │ - │ 35 ┆ 10 │ - └─────┴────────┘ - - >>> df.with_columns(change=pl.col("int").diff(n=2)) - shape: (5, 2) - ┌─────┬────────┐ - │ int ┆ change │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪════════╡ - │ 20 ┆ null │ - │ 10 ┆ null │ - │ 30 ┆ 10 │ - │ 25 ┆ 15 │ - │ 35 ┆ 5 │ - └─────┴────────┘ - - >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) - shape: (3, 1) - ┌──────┐ - │ diff │ - │ --- │ - │ i64 │ - ╞══════╡ - │ 10 │ - │ 15 │ - │ 5 │ - └──────┘ - - ''' - def pct_change(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Computes percentage change between values. - - Percentage change (as fraction) between current element and most-recent - non-null element at least `n` period(s) before the current element. - - Computes the change from the previous row by default. - - Parameters - ---------- - n - periods to shift for forming percent change. - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "a": [10, 11, 12, None, 12], - ... } - ... ) - >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) - shape: (5, 2) - ┌──────┬────────────┐ - │ a ┆ pct_change │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞══════╪════════════╡ - │ 10 ┆ null │ - │ 11 ┆ 0.1 │ - │ 12 ┆ 0.090909 │ - │ null ┆ 0.0 │ - │ 12 ┆ 0.0 │ - └──────┴────────────┘ - - ''' - def skew(self) -> Self: - ''' - Compute the sample skewness of a data set. - - For normally distributed data, the skewness should be about zero. For - unimodal continuous distributions, a skewness value greater than zero means - that there is more weight in the right tail of the distribution. The - function `skewtest` can be used to determine if the skewness value - is close enough to zero, statistically speaking. - - - See scipy.stats for more information. - - Parameters - ---------- - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Notes - ----- - The sample skewness is computed as the Fisher-Pearson coefficient - of skewness, i.e. - - .. math:: g_1=\\frac{m_3}{m_2^{3/2}} - - where - - .. 
math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i - - is the biased sample :math:`i\\texttt{th}` central moment, and - :math:`\\bar{x}` is - the sample mean. If `bias` is False, the calculations are - corrected for bias and the value computed is the adjusted - Fisher-Pearson standardized moment coefficient, i.e. - - .. math:: - G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").skew()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.343622 │ - └──────────┘ - - ''' - def kurtosis(self) -> Self: - ''' - Compute the kurtosis (Fisher or Pearson) of a dataset. - - Kurtosis is the fourth central moment divided by the square of the - variance. If Fisher\'s definition is used, then 3.0 is subtracted from - the result to give 0.0 for a normal distribution. - If bias is False then the kurtosis is calculated using k statistics to - eliminate bias coming from biased moment estimators. - - See scipy.stats for more information - - Parameters - ---------- - fisher : bool, optional - If True, Fisher\'s definition is used (normal ==> 0.0). If False, - Pearson\'s definition is used (normal ==> 3.0). - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").kurtosis()) - shape: (1, 1) - ┌───────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -1.153061 │ - └───────────┘ - - ''' - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: - ''' - Set values outside the given boundaries to the boundary value. - - Parameters - ---------- - lower_bound - Lower bound. Accepts expression input. - Non-expression inputs are parsed as literals. - upper_bound - Upper bound. Accepts expression input. - Non-expression inputs are parsed as literals. - - See Also - -------- - when - - Notes - ----- - This method only works for numeric and temporal columns. To clip other data - types, consider writing a `when-then-otherwise` expression. See :func:`when`. - - Examples - -------- - Specifying both a lower and upper bound: - - >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) - >>> df.with_columns(clip=pl.col("a").clip(1, 10)) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ clip │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ -50 ┆ 1 │ - │ 5 ┆ 5 │ - │ 50 ┆ 10 │ - │ null ┆ null │ - └──────┴──────┘ - - Specifying only a single bound: - - >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ clip │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ -50 ┆ -50 │ - │ 5 ┆ 5 │ - │ 50 ┆ 10 │ - │ null ┆ null │ - └──────┴──────┘ - - ''' - def lower_bound(self) -> Self: - ''' - Calculate the lower bound. - - Returns a unit Series with the lowest value possible for the dtype of this - expression. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").lower_bound()) - shape: (1, 1) - ┌──────────────────────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════════════════════╡ - │ -9223372036854775808 │ - └──────────────────────┘ - - ''' - def upper_bound(self) -> Self: - ''' - Calculate the upper bound. - - Returns a unit Series with the highest value possible for the dtype of this - expression. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) - >>> df.select(pl.col("a").upper_bound()) - shape: (1, 1) - ┌─────────────────────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════════════════════╡ - │ 9223372036854775807 │ - └─────────────────────┘ - - ''' - def sign(self) -> Self: - ''' - Compute the element-wise indication of the sign. - - The returned values can be -1, 0, or 1: - - * -1 if x < 0. - * 0 if x == 0. - * 1 if x > 0. - - (null values are preserved as-is). - - Examples - -------- - >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) - >>> df.select(pl.col("a").sign()) - shape: (5, 1) - ┌──────┐ - │ a │ - │ --- │ - │ i64 │ - ╞══════╡ - │ -1 │ - │ 0 │ - │ 0 │ - │ 1 │ - │ null │ - └──────┘ - - ''' - def sin(self) -> Self: - ''' - Compute the element-wise value for the sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").sin()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def cos(self) -> Self: - ''' - Compute the element-wise value for the cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").cos()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 1.0 │ - └─────┘ - - ''' - def tan(self) -> Self: - ''' - Compute the element-wise value for the tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").tan().round(2)) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 1.56 │ - └──────┘ - - ''' - def cot(self) -> Self: - ''' - Compute the element-wise value for the cotangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").cot().round(2)) - shape: (1, 1) - ┌──────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════╡ - │ 0.64 │ - └──────┘ - - ''' - def arcsin(self) -> Self: - ''' - Compute the element-wise value for the inverse sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arcsin()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.570796 │ - └──────────┘ - - ''' - def arccos(self) -> Self: - ''' - Compute the element-wise value for the inverse cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [0.0]}) - >>> df.select(pl.col("a").arccos()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.570796 │ - └──────────┘ - - ''' - def arctan(self) -> Self: - ''' - Compute the element-wise value for the inverse tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arctan()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.785398 │ - └──────────┘ - - ''' - def sinh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. 
- - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").sinh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.175201 │ - └──────────┘ - - ''' - def cosh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").cosh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.543081 │ - └──────────┘ - - ''' - def tanh(self) -> Self: - ''' - Compute the element-wise value for the hyperbolic tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").tanh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.761594 │ - └──────────┘ - - ''' - def arcsinh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic sine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arcsinh()) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.881374 │ - └──────────┘ - - ''' - def arccosh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic cosine. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arccosh()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 0.0 │ - └─────┘ - - ''' - def arctanh(self) -> Self: - ''' - Compute the element-wise value for the inverse hyperbolic tangent. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1.0]}) - >>> df.select(pl.col("a").arctanh()) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ inf │ - └─────┘ - - ''' - def degrees(self) -> Self: - ''' - Convert from radians to degrees. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> import math - >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) - >>> df.select(pl.col("a").degrees()) - shape: (9, 1) - ┌────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞════════╡ - │ -720.0 │ - │ -540.0 │ - │ -360.0 │ - │ -180.0 │ - │ 0.0 │ - │ 180.0 │ - │ 360.0 │ - │ 540.0 │ - │ 720.0 │ - └────────┘ - ''' - def radians(self) -> Self: - ''' - Convert from degrees to radians. - - Returns - ------- - Expr - Expression of data type :class:`Float64`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) - >>> df.select(pl.col("a").radians()) - shape: (9, 1) - ┌────────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞════════════╡ - │ -12.566371 │ - │ -9.424778 │ - │ -6.283185 │ - │ -3.141593 │ - │ 0.0 │ - │ 3.141593 │ - │ 6.283185 │ - │ 9.424778 │ - │ 12.566371 │ - └────────────┘ - ''' - def reshape(self, dimensions: tuple[int, ...]) -> Self: - ''' - Reshape this Expr to a flat Series or a Series of Lists. - - Parameters - ---------- - dimensions - Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that - dimension is inferred. - - Returns - ------- - Expr - If a single dimension is given, results in an expression of the original - data type. 
- If a multiple dimensions are given, results in an expression of data type - :class:`List` with shape (rows, cols). - - Examples - -------- - >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) - >>> df.select(pl.col("foo").reshape((3, 3))) - shape: (3, 1) - ┌───────────┐ - │ foo │ - │ --- │ - │ list[i64] │ - ╞═══════════╡ - │ [1, 2, 3] │ - │ [4, 5, 6] │ - │ [7, 8, 9] │ - └───────────┘ - - See Also - -------- - Expr.list.explode : Explode a list column. - - ''' - def shuffle(self, seed: int | None = ...) -> Self: - ''' - Shuffle the contents of this expression. - - Parameters - ---------- - seed - Seed for the random number generator. If set to None (default), a - random seed is generated each time the shuffle is called. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").shuffle(seed=1)) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - │ 1 │ - │ 3 │ - └─────┘ - - ''' - def sample(self, n: int | IntoExprColumn | None = ...) -> Self: - ''' - Sample from this expression. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - Shuffle the order of sampled data points. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 3 │ - │ 1 │ - │ 1 │ - └─────┘ - - ''' - def ewm_mean(self) -> Self: - ''' - Exponentially-weighted moving average. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. 
For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_mean(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.0 │ - │ 1.666667 │ - │ 2.428571 │ - └──────────┘ - - ''' - def ewm_std(self) -> Self: - ''' - Exponentially-weighted moving standard deviation. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_std(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 0.707107 │ - │ 0.963624 │ - └──────────┘ - - ''' - def ewm_var(self) -> Self: - ''' - Exponentially-weighted moving variance. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
- adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").ewm_var(com=1)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 0.5 │ - │ 0.928571 │ - └──────────┘ - - ''' - def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: - ''' - Extremely fast method for extending the Series with \'n\' copies of a value. - - Parameters - ---------- - value - A constant literal value (not an expression) with which to extend the - expression result Series; can pass None to extend with nulls. - n - The number of additional values that will be added. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3]}) - >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) - shape: (5, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 0 │ - │ 1 │ - │ 2 │ - │ 99 │ - │ 99 │ - └────────┘ - - ''' - def value_counts(self) -> Self: - ''' - Count the occurrences of unique values. - - Parameters - ---------- - sort - Sort the output by count in descending order. - If set to `False` (default), the order of the output is random. - parallel - Execute the computation in parallel. - - .. note:: - This option should likely not be enabled in a group by context, - as the computation is already parallelized per group. - - Returns - ------- - Expr - Expression of data type :class:`Struct` with mapping of unique values to - their count. - - Examples - -------- - >>> df = pl.DataFrame( - ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} - ... ) - >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT - shape: (3, 1) - ┌─────────────┐ - │ color │ - │ --- │ - │ struct[2] │ - ╞═════════════╡ - │ {"red",2} │ - │ {"green",1} │ - │ {"blue",3} │ - └─────────────┘ - - Sort the output by count. - - >>> df.select(pl.col("color").value_counts(sort=True)) - shape: (3, 1) - ┌─────────────┐ - │ color │ - │ --- │ - │ struct[2] │ - ╞═════════════╡ - │ {"blue",3} │ - │ {"red",2} │ - │ {"green",1} │ - └─────────────┘ - - ''' - def unique_counts(self) -> Self: - ''' - Return a count of the unique values in the order of appearance. 
- - This method differs from `value_counts` in that it does not return the - values, only the counts and might be faster - - Examples - -------- - >>> df = pl.DataFrame( - ... { - ... "id": ["a", "b", "b", "c", "c", "c"], - ... } - ... ) - >>> df.select( - ... [ - ... pl.col("id").unique_counts(), - ... ] - ... ) - shape: (3, 1) - ┌─────┐ - │ id │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - ''' - def log(self, base: float = ...) -> Self: - ''' - Compute the logarithm to a given base. - - Parameters - ---------- - base - Given base, defaults to `e` - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").log(base=2)) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.0 │ - │ 1.0 │ - │ 1.584963 │ - └──────────┘ - - ''' - def log1p(self) -> Self: - ''' - Compute the natural logarithm of each element plus one. - - This computes `log(1 + x)` but is more numerically stable for `x` close to zero. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").log1p()) - shape: (3, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 0.693147 │ - │ 1.098612 │ - │ 1.386294 │ - └──────────┘ - - ''' - def entropy(self, base: float = ...) -> Self: - ''' - Computes the entropy. - - Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. - - Parameters - ---------- - base - Given base, defaults to `e` - normalize - Normalize pk if it doesn\'t sum to 1. - - Examples - -------- - >>> df = pl.DataFrame({"a": [1, 2, 3]}) - >>> df.select(pl.col("a").entropy(base=2)) - shape: (1, 1) - ┌──────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞══════════╡ - │ 1.459148 │ - └──────────┘ - >>> df.select(pl.col("a").entropy(base=2, normalize=False)) - shape: (1, 1) - ┌───────────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -6.754888 │ - └───────────┘ - - ''' - def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: - ''' - Run an expression over a sliding window that increases `1` slot every iteration. - - Parameters - ---------- - expr - Expression to evaluate - min_periods - Number of valid values there should be in the window before the expression - is evaluated. valid values = `length - null_count` - parallel - Run in parallel. Don\'t do this in a group by or another operation that - already has much parallelization. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - This can be really slow as it can have `O(n^2)` complexity. Don\'t use this - for operations that visit all elements. - - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) - >>> df.select( - ... [ - ... pl.col("values").cumulative_eval( - ... pl.element().first() - pl.element().last() ** 2 - ... ) - ... ] - ... ) - shape: (5, 1) - ┌────────┐ - │ values │ - │ --- │ - │ f64 │ - ╞════════╡ - │ 0.0 │ - │ -3.0 │ - │ -8.0 │ - │ -15.0 │ - │ -24.0 │ - └────────┘ - - ''' - def set_sorted(self) -> Self: - ''' - Flags the expression as \'sorted\'. - - Enables downstream code to user fast paths for sorted arrays. - - Parameters - ---------- - descending - Whether the `Series` order is descending. - - Warnings - -------- - This can lead to incorrect results if this `Series` is not sorted!! - Use with care! 
- - Examples - -------- - >>> df = pl.DataFrame({"values": [1, 2, 3]}) - >>> df.select(pl.col("values").set_sorted().max()) - shape: (1, 1) - ┌────────┐ - │ values │ - │ --- │ - │ i64 │ - ╞════════╡ - │ 3 │ - └────────┘ - - ''' - def shrink_dtype(self) -> Self: - ''' - Shrink numeric columns to the minimal required datatype. - - Shrink to the dtype needed to fit the extrema of this [`Series`]. - This can be used to reduce memory pressure. - - Examples - -------- - >>> pl.DataFrame( - ... { - ... "a": [1, 2, 3], - ... "b": [1, 2, 2 << 32], - ... "c": [-1, 2, 1 << 30], - ... "d": [-112, 2, 112], - ... "e": [-112, 2, 129], - ... "f": ["a", "b", "c"], - ... "g": [0.1, 1.32, 0.12], - ... "h": [True, None, False], - ... } - ... ).select(pl.all().shrink_dtype()) - shape: (3, 8) - ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ - ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ - │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ - │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ - │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ - └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ - - ''' - def cache(self) -> Self: - """ - Cache this expression so that it only is executed once per context. - - .. deprecated:: 0.18.9 - This method now does nothing. It has been superseded by the - `comm_subexpr_elim` setting on `LazyFrame.collect`, which automatically - caches expressions that are equal. - - """ - def replace(self, mapping: dict[Any, Any]) -> Self: - ''' - Replace values according to the given mapping. - - Needs a global string cache for lazily evaluated queries on columns of - type `Categorical`. - - Parameters - ---------- - mapping - Mapping of values to their replacement. - default - Value to use when the mapping does not contain the lookup value. - Defaults to keeping the original value. Accepts expression input. - Non-expression inputs are parsed as literals. - return_dtype - Set return dtype to override automatic return dtype determination. - - See Also - -------- - str.replace - - Examples - -------- - Replace a single value by another value. Values not in the mapping remain - unchanged. - - >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) - >>> df.with_columns(pl.col("a").replace({2: 100}).alias("replaced")) - shape: (4, 2) - ┌─────┬──────────┐ - │ a ┆ replaced │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════════╡ - │ 1 ┆ 1 │ - │ 2 ┆ 100 │ - │ 2 ┆ 100 │ - │ 3 ┆ 3 │ - └─────┴──────────┘ - - Replace multiple values. Specify a default to set values not in the given map - to the default value. - - >>> df = pl.DataFrame({"country_code": ["FR", "ES", "DE", None]}) - >>> country_code_map = { - ... "CA": "Canada", - ... "DE": "Germany", - ... "FR": "France", - ... None: "unspecified", - ... } - >>> df.with_columns( - ... pl.col("country_code") - ... .replace(country_code_map, default=None) - ... .alias("replaced") - ... ) - shape: (4, 2) - ┌──────────────┬─────────────┐ - │ country_code ┆ replaced │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞══════════════╪═════════════╡ - │ FR ┆ France │ - │ ES ┆ null │ - │ DE ┆ Germany │ - │ null ┆ unspecified │ - └──────────────┴─────────────┘ - - The return type can be overridden with the `return_dtype` argument. - - >>> df = df.with_row_count() - >>> df.select( - ... "row_nr", - ... pl.col("row_nr") - ... .replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) - ... 
.alias("replaced"), - ... ) - shape: (4, 2) - ┌────────┬──────────┐ - │ row_nr ┆ replaced │ - │ --- ┆ --- │ - │ u32 ┆ u8 │ - ╞════════╪══════════╡ - │ 0 ┆ 0 │ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 0 │ - └────────┴──────────┘ - - To reference other columns as a `default` value, a struct column must be - constructed first. The first field must be the column in which values are - replaced. The other columns can be used in the default expression. - - >>> df.with_columns( - ... pl.struct("country_code", "row_nr") - ... .replace( - ... mapping=country_code_map, - ... default=pl.col("row_nr").cast(pl.Utf8), - ... ) - ... .alias("replaced") - ... ) - shape: (4, 3) - ┌────────┬──────────────┬─────────────┐ - │ row_nr ┆ country_code ┆ replaced │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ str ┆ str │ - ╞════════╪══════════════╪═════════════╡ - │ 0 ┆ FR ┆ France │ - │ 1 ┆ ES ┆ 1 │ - │ 2 ┆ DE ┆ Germany │ - │ 3 ┆ null ┆ unspecified │ - └────────┴──────────────┴─────────────┘ - ''' - def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom python function to a Series or sequence of Series. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.map_batches`. - - Parameters - ---------- - function - Lambda/ function to apply. - return_dtype - Dtype of the output Series. - agg_list - Aggregate list - - """ - def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.map_elements`. - - Parameters - ---------- - function - Lambda/ function to apply. - return_dtype - Dtype of the output Series. - If not set, the dtype will be - `polars.Unknown`. - skip_nulls - Don't apply the function over values - that contain nulls. This is faster. - pass_name - Pass the Series name to the custom function - This is more expensive. - strategy : {'thread_local', 'threading'} - This functionality is in `alpha` stage. This may be removed - /changed without it being considered a breaking change. - - - 'thread_local': run the python function on a single thread. - - 'threading': run the python function on separate threads. Use with - care as this can slow performance. This might only speed up - your code if the amount of work per element is significant - and the python function releases the GIL (e.g. via calling - a c function) - - """ - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: - """ - Apply a custom rolling window function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Expr.rolling_map`. - - Parameters - ---------- - function - Aggregation function - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - """ - def is_first(self) -> Self: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Expr.is_first_distinct`. 
- - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - """ - def is_last(self) -> Self: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Expr.is_last_distinct`. - - Returns - ------- - Expr - Expression of data type :class:`Boolean`. - - """ - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: - """ - Clip (limit) the values in an array to a `min` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - lower_bound - Lower bound. - - """ - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: - """ - Clip (limit) the values in an array to a `max` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - upper_bound - Upper bound. - - """ - def shift_and_fill(self, fill_value: IntoExpr) -> Self: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - Fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def register_plugin(self) -> Self: - """ - Register a shared library as a plugin. - - .. warning:: - This is highly unsafe as this will call the C function - loaded by `lib::symbol`. - - The parameters you give dictate how polars will deal - with the function. Make sure they are correct! - - .. note:: - This functionality is unstable and may change without it - being considered breaking. - - Parameters - ---------- - lib - Library to load. - symbol - Function to load. - args - Arguments (other than self) passed to this function. - These arguments have to be of type Expression. - kwargs - Non-expression arguments. They must be JSON serializable. - is_elementwise - If the function only operates on scalars - this will trigger fast paths. - input_wildcard_expansion - Expand expressions as input of this function. - returns_scalar - Automatically explode on unit length if it ran as final aggregation. - this is the case for aggregations like `sum`, `min`, `covariance` etc. - cast_to_supertypes - Cast the input datatypes to their supertype. - pass_name_to_apply - if set, then the `Series` passed to the function in the group_by operation - will ensure the name is set. This is an extra heap allocation per group. - changes_length - For example a `unique` or a `slice` - - """ - def _register_plugin(self) -> Self: ... - def take_every(self, n: int) -> Self: - """ - Take every nth value in the Series and return as a new Series. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: - """ - Take values by index. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather`. - - Parameters - ---------- - indices - An expression that leads to a UInt32 dtyped Series. - """ - def cumsum(self) -> Self: - """ - Get an array with the cumulative sum computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_sum`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cumprod(self) -> Self: - """ - Get an array with the cumulative product computed at every element. - - .. 
deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_prod`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cummin(self) -> Self: - """ - Get an array with the cumulative min computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_min`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cummax(self) -> Self: - """ - Get an array with the cumulative max computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_max`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def cumcount(self) -> Self: - """ - Get an array with the cumulative count computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_count`. - - Parameters - ---------- - reverse - Reverse the operation. - """ - def map_dict(self, mapping: dict[Any, Any]) -> Self: - """ - Replace values in column according to remapping dictionary. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`replace`. The default behavior - has changed to keep any values not present in the mapping unchanged. - Pass `default=None` to keep existing behavior. - - Parameters - ---------- - mapping - Dictionary containing the before/after values to map. - default - Value to use when the remapping dict does not contain the lookup value. - Accepts expression input. Non-expression inputs are parsed as literals. - Use `pl.first()`, to keep the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - - """ - @property - def bin(self): ... - @property - def cat(self): ... - @property - def dt(self): ... - @property - def list(self): ... - @property - def arr(self): ... - @property - def meta(self): ... - @property - def name(self): ... - @property - def str(self): ... - @property - def struct(self): ... -def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: - """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" -def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... 
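The docstrings in the stub removed above carry `.. deprecated::` directives that map old expression methods to their replacements (for example `cumsum` -> `cum_sum` in 0.19.14, and `map_dict` -> `replace` in 0.19.16, where values missing from the mapping are now kept unchanged by default). A minimal sketch of the renamed calls, assuming a polars version at or above 0.19.16 is installed; the column name "a" and the mapping are illustrative only:

import polars as pl

df = pl.DataFrame({"a": [1, 2, 2, 3]})

# cumsum was renamed to cum_sum in 0.19.14
running = df.select(pl.col("a").cum_sum().alias("running_total"))

# map_dict was renamed to replace in 0.19.16; values not present in the
# mapping (here 1 and 3) are kept unchanged by default
relabeled = df.select(pl.col("a").replace({2: 100}).alias("relabeled"))
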
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/expr/expr.pyi new file mode 100644 index 0000000..8a218c0 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/expr/expr.pyi @@ -0,0 +1,8462 @@ +#: version 0.20.3 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Int64 as Int64 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import _warn_null_comparison as _warn_null_comparison, no_default as no_default, sphinx_accessor as sphinx_accessor +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _to_pyexpr(self, other: Any) -> PyExpr: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __radd__(self, other: Any) -> Self: ... + def __and__(self, other: Expr | int | bool) -> Self: ... + def __rand__(self, other: Any) -> Self: ... + def __eq__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __rfloordiv__(self, other: Any) -> Self: ... + def __ge__(self, other: Any) -> Self: ... + def __gt__(self, other: Any) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: Any) -> Self: ... + def __lt__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __rmod__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... 
+ def __rmul__(self, other: Any) -> Self: ... + def __ne__(self, other: Any) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: Expr | int | bool) -> Self: ... + def __ror__(self, other: Any) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, power: int | float | Series | Expr) -> Self: ... + def __rpow__(self, base: int | float | Expr) -> Expr: ... + def __sub__(self, other: Any) -> Self: ... + def __rsub__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __rtruediv__(self, other: Any) -> Self: ... + def __xor__(self, other: Expr | int | bool) -> Self: ... + def __rxor__(self, other: Any) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + .. 
note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... 
) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.map`. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + keep_name + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.prefix`. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.suffix`. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.keep`. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with `^` and end with `$`. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... ) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... 
"""Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. 
+ + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + + ''' + def count(self) -> Self: + ''' + Return the number of non-null elements in the column. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + len + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 2 │ + └─────┴─────┘ + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values count towards the total. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + count + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. 
Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + + ''' + def cum_sum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_sum().alias("cum_sum"), + ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_sum ┆ cum_sum_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 10 │ + │ 2 ┆ 3 ┆ 9 │ + │ 3 ┆ 6 ┆ 7 │ + │ 4 ┆ 10 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_sum().alias("value_cum_sum"), + ... pl.col("values") + ... 
.cum_sum() + ... .forward_fill() + ... .alias("value_cum_sum_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬───────────────┬──────────────────────────┐ + │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═══════════════╪══════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴───────────────┴──────────────────────────┘ + + ''' + def cum_prod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_prod().alias("cum_prod"), + ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), + ... ) + shape: (4, 3) + ┌─────┬──────────┬──────────────────┐ + │ a ┆ cum_prod ┆ cum_prod_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════╪══════════════════╡ + │ 1 ┆ 1 ┆ 24 │ + │ 2 ┆ 2 ┆ 24 │ + │ 3 ┆ 6 ┆ 12 │ + │ 4 ┆ 24 ┆ 4 │ + └─────┴──────────┴──────────────────┘ + + ''' + def cum_min(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_min().alias("cum_min"), + ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_min ┆ cum_min_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 1 ┆ 3 │ + │ 4 ┆ 1 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + ''' + def cum_max(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_max().alias("cum_max"), + ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_max ┆ cum_max_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 2 ┆ 4 │ + │ 3 ┆ 3 ┆ 4 │ + │ 4 ┆ 4 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_max().alias("cum_max"), + ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬─────────┬────────────────────┐ + │ values ┆ cum_max ┆ cum_max_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════════╪════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴─────────┴────────────────────┘ + + ''' + def cum_count(self) -> Self: + ''' + Get an array with the cumulative count computed at every element. 
+ + Counting from 0 to len + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_count().alias("cum_count"), + ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), + ... ) + shape: (4, 3) + ┌─────┬───────────┬───────────────────┐ + │ a ┆ cum_count ┆ cum_count_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ u32 ┆ u32 │ + ╞═════╪═══════════╪═══════════════════╡ + │ 1 ┆ 0 ┆ 3 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 2 ┆ 1 │ + │ 4 ┆ 3 ┆ 0 │ + └─────┴───────────┴───────────────────┘ + + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + + ''' + def round_sig_figs(self, digits: int) -> Self: + ''' + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) + >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) + shape: (3, 2) + ┌─────────┬────────────────┐ + │ a ┆ round_sig_figs │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════════╪════════════════╡ + │ 0.01234 ┆ 0.012 │ + │ 3.333 ┆ 3.3 │ + │ 1234.0 ┆ 1200.0 │ + └─────────┴────────────────┘ + + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... 
) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... ) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. 
+ + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + See Also + -------- + Expr.get : Take a single value + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg( + ... pl.col("value").gather([2, 1]) + ... ) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + ''' + def get(self, index: int | Expr) -> Self: + ''' + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. 
+ + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... 
) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + + ''' + def rle(self) -> Self: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn], **constraints: Any) -> Self: + ''' + Filter the expression based on one or more predicate expressions. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicates + Expression(s) that evaluates to a boolean Series. + constraints + Column filters; use `name = value` to filter columns by the supplied value. + Each constraint will behave the same as `pl.col(name).eq(value)`, and + will be implicitly joined with the other filter conditions using `&`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... 
) + >>> df.group_by("group_col").agg( + ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), + ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + Filter expressions can also take constraints as keyword arguments. + + >>> import polars.selectors as cs + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "a", "a", "a", "b", "b", "b", "b", "b"], + ... "n": [1, 2, 2, 3, 1, 3, 3, 2, 3], + ... }, + ... ) + >>> df.group_by("key").agg( + ... n_1=pl.col("n").filter(n=1).sum(), + ... n_2=pl.col("n").filter(n=2).sum(), + ... n_3=pl.col("n").filter(n=3).sum(), + ... ).sort(by="key") + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ key ┆ n_1 ┆ n_2 ┆ n_3 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 4 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 9 │ + └─────┴─────┴─────┴─────┘ + + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series. If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for `map` functions is transforming the values + represented by an expression using a third-party library. + + .. warning:: + If you are looking to map a function over a window function or group_by + context, refer to :func:`map_elements` instead. + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + is_elementwise + If set to true this can run in the streaming engine, but may yield + incorrect results in group-by. Ensure you know what you are doing! + agg_list + Aggregate list. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_elements + replace + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. 
warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type `Callable[[Any], Any]`. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type `Callable[[Series], Any]`. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be `pl.Unknown`. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). + pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using `map_elements` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using `over` is considered a GroupBy context + here, so `map_elements` can be used to map functions over window groups. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... 
) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using `over` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... ).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort("key") # doctest: +IGNORE_RESULT + + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + + ''' + def gather_every(self, n: int, offset: int = ...) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").gather_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + >>> df.select(pl.col("foo").gather_every(3, offset=1)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 5 │ + │ 8 │ + └─────┘ + + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. 
+ + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator `expr & other & ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator `expr | other | ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other` where `None == None`. + + This differs from default `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... 
"x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator `expr >= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ true │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator `expr > other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator `expr <= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator `expr < other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator `expr != other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr != other` where `None == None`. + + This differs from default `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator `expr + other`. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator `expr // other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... 
) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator `expr % other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator `expr * other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator `expr - other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator `expr / other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + + ''' + def pow(self, exponent: int | float | None | Series | Expr) -> Self: + ''' + Method equivalent of exponentiation operator `expr ** exponent`. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... 
) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator `expr ^ other`. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) + shape: (3, 3) + ┌───────────┬──────────────────┬──────────┐ + │ sets ┆ optional_members ┆ contains │ + │ --- ┆ --- ┆ --- │ + │ list[i64] ┆ i64 ┆ bool │ + ╞═══════════╪══════════════════╪══════════╡ + │ [1, 2, 3] ┆ 1 ┆ true │ + │ [1, 2] ┆ 2 ┆ true │ + │ [9, 10] ┆ 3 ┆ false │ + └───────────┴──────────────────┴──────────┘ + + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... 
) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given start and end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with `lit` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... 
) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("row_nr").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("row_nr").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). 
Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └────────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬──────────────────┐ + │ row_nr ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └────────┴─────────────────────┴──────────────────┘ + + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. 
+ weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("row_nr").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. 
+ weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_count() + >>> df_temporal + shape: (25, 2) + ┌────────┬─────────────────────┐ + │ row_nr ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞════════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └────────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... 
rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └────────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("row_nr").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌────────┬─────────────────────┬─────────────────┐ + │ row_nr ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞════════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └────────┴─────────────────────┴─────────────────┘ + + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. 
warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". 
+ + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. 
+ + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. 
+ + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. 
math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: + ''' + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. 
+ If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. 
+ + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + + ''' + def hist(self, bins: IntoExpr | None = ...) -> Self: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + include_breakpoint + Include a column that indicates the upper breakpoint. + include_category + Include a column that shows the intervals as categories. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 3, 8, 8, 2, 1, 3]}) + >>> df.select(pl.col("a").hist(bins=[1, 2, 3])) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 2 │ + │ 2 │ + └─────┘ + >>> df.select( + ... pl.col("a").hist( + ... bins=[1, 2, 3], include_breakpoint=True, include_category=True + ... ) + ... ) + shape: (4, 1) + ┌───────────────────────┐ + │ a │ + │ --- │ + │ struct[3] │ + ╞═══════════════════════╡ + │ {1.0,"(-inf, 1.0]",2} │ + │ {2.0,"(1.0, 2.0]",1} │ + │ {3.0,"(2.0, 3.0]",2} │ + │ {inf,"(3.0, inf]",2} │ + └───────────────────────┘ + + ''' + def replace(self, old: IntoExpr | Sequence[Any] | Mapping[Any, Any], new: IntoExpr | Sequence[Any] | NoDefault = ...) -> Self: + ''' + Replace values by different values. + + Parameters + ---------- + old + Value or sequence of values to replace. + Accepts expression input. Sequences are parsed as Series, + other non-expression inputs are parsed as literals. + Also accepts a mapping of values to their replacement as syntactic sugar for + `replace(new=Series(mapping.keys()), old=Series(mapping.values()))`. + new + Value or sequence of values to replace by. + Accepts expression input. Sequences are parsed as Series, + other non-expression inputs are parsed as literals. + Length must match the length of `old` or have length 1. + default + Set values that were not replaced to this value. + Defaults to keeping the original value. + Accepts expression input. Non-expression inputs are parsed as literals. + return_dtype + The data type of the resulting expression. 
If set to `None` (default), + the data type is determined automatically based on the other inputs. + + See Also + -------- + str.replace + + Notes + ----- + The global string cache must be enabled when replacing categorical values. + + Examples + -------- + Replace a single value by another value. Values that were not replaced remain + unchanged. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) + >>> df.with_columns(replaced=pl.col("a").replace(2, 100)) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 3 │ + └─────┴──────────┘ + + Replace multiple values by passing sequences to the `old` and `new` parameters. + + >>> df.with_columns(replaced=pl.col("a").replace([2, 3], [100, 200])) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 200 │ + └─────┴──────────┘ + + Passing a mapping with replacements is also supported as syntactic sugar. + Specify a default to set all values that were not matched. + + >>> mapping = {2: 100, 3: 200} + >>> df.with_columns(replaced=pl.col("a").replace(mapping, default=-1)) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ -1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 200 │ + └─────┴──────────┘ + + Replacing by values of a different data type sets the return type based on + a combination of the `new` data type and either the original data type or the + default data type if it was set. + + >>> df = pl.DataFrame({"a": ["x", "y", "z"]}) + >>> mapping = {"x": 1, "y": 2, "z": 3} + >>> df.with_columns(replaced=pl.col("a").replace(mapping)) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + >>> df.with_columns(replaced=pl.col("a").replace(mapping, default=None)) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + + Set the `return_dtype` parameter to control the resulting data type directly. + + >>> df.with_columns( + ... replaced=pl.col("a").replace(mapping, return_dtype=pl.UInt8) + ... ) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ u8 │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + + Expression input is supported for all parameters. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3], "b": [1.5, 2.5, 5.0, 1.0]}) + >>> df.with_columns( + ... replaced=pl.col("a").replace( + ... old=pl.col("a").max(), + ... new=pl.col("b").sum(), + ... default=pl.col("b"), + ... ) + ... ) + shape: (4, 3) + ┌─────┬─────┬──────────┐ + │ a ┆ b ┆ replaced │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═════╪══════════╡ + │ 1 ┆ 1.5 ┆ 1.5 │ + │ 2 ┆ 2.5 ┆ 2.5 │ + │ 2 ┆ 5.0 ┆ 5.0 │ + │ 3 ┆ 1.0 ┆ 10.0 │ + └─────┴─────┴──────────┘ + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. 
+ agg_list + Aggregate list + + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + `polars.Unknown`. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). 
+ + """ + def register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by `lib::symbol`. + + The parameters you give dictate how polars will deal + with the function. Make sure they are correct! + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. + returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like `sum`, `min`, `covariance` etc. + cast_to_supertypes + Cast the input datatypes to their supertype. + pass_name_to_apply + if set, then the `Series` passed to the function in the group_by operation + will ensure the name is set. This is an extra heap allocation per group. + changes_length + For example a `unique` or a `slice` + + """ + def _register_plugin(self) -> Self: ... + def take_every(self, n: int, offset: int = ...) -> Self: + """ + Take every nth value in the Series and return as a new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + """ + def cumsum(self) -> Self: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumprod(self) -> Self: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummin(self) -> Self: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummax(self) -> Self: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumcount(self) -> Self: + """ + Get an array with the cumulative count computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_count`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in column according to remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. 
+ + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/lazyframe/frame b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/lazyframe/frame deleted file mode 100644 index 561f5b2..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/lazyframe/frame +++ /dev/null @@ -1,4211 +0,0 @@ -import P -import np -import pa -from builtins import PyLazyFrame -from pathlib import Path -from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Utf8 as Utf8 -from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype -from polars.dependencies import dataframe_api_compat as dataframe_api_compat -from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud -from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte -from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec -from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec -from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy -from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector -from polars.slice import LazyPolarsSlice as LazyPolarsSlice -from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult -from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions -from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr -from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, 
is_sequence as is_sequence, normalize_filepath as normalize_filepath -from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence - -TYPE_CHECKING: bool -DTYPE_TEMPORAL_UNITS: frozenset -N_INFER_DEFAULT: int - -class LazyFrame: - _accessors: _ClassVar[set] = ... - def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... - @classmethod - def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... - @classmethod - def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: - """ - Lazily read from a CSV file or multiple files via glob patterns. - - Use `pl.scan_csv` to dispatch to this method. - - See Also - -------- - polars.io.scan_csv - - """ - @classmethod - def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: - """ - Lazily read from a parquet file or multiple files via glob patterns. - - Use `pl.scan_parquet` to dispatch to this method. - - See Also - -------- - polars.io.scan_parquet - - """ - @classmethod - def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: - """ - Lazily read from an Arrow IPC (Feather v2) file. - - Use `pl.scan_ipc` to dispatch to this method. - - See Also - -------- - polars.io.scan_ipc - - """ - @classmethod - def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: - """ - Lazily read from a newline delimited JSON file. - - Use `pl.scan_ndjson` to dispatch to this method. - - See Also - -------- - polars.io.scan_ndjson - - """ - @classmethod - def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... - @classmethod - def from_json(cls, json: str) -> Self: - """ - Read a logical plan from a JSON string to construct a LazyFrame. - - .. deprecated:: 0.18.12 - This method is deprecated. Convert the JSON string to `StringIO` - and then use `LazyFrame.deserialize`. - - Parameters - ---------- - json - String in JSON format. - - See Also - -------- - deserialize - - """ - @classmethod - def read_json(cls, source: str | Path | IOBase) -> Self: - """ - Read a logical plan from a JSON file to construct a LazyFrame. - - .. deprecated:: 0.18.12 - This class method has been renamed to `deserialize`. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - - See Also - -------- - deserialize - - """ - @classmethod - def deserialize(cls, source: str | Path | IOBase) -> Self: - ''' - Read a logical plan from a JSON file to construct a LazyFrame. - - Parameters - ---------- - source - Path to a file or a file-like object (by file-like object, we refer to - objects that have a `read()` method, such as a file handler (e.g. - via builtin `open` function) or `BytesIO`). - - See Also - -------- - LazyFrame.serialize - - Examples - -------- - >>> import io - >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() - >>> json = lf.serialize() - >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def __dataframe_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. - """ - def __bool__(self) -> NoReturn: ... 
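# ----------------------------------------------------------------------------------
# Editorial usage sketch -- not part of the generated stubs or of this diff. The Expr
# stub added earlier in this diff records a batch of deprecation renames
# (apply -> map_elements, take -> gather, take_every -> gather_every,
# cumsum -> cum_sum, map_dict -> replace, ...). A minimal illustration of the old
# versus preferred spellings, assuming a 0.19.14+ / 0.20.x polars install; the frame
# and column names below are illustrative only.
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3, 4]})

# Deprecated spellings still resolve against the generated stubs, but emit a
# DeprecationWarning at runtime.
old_style = df.select(pl.col("a").cumsum())

# Preferred replacements named in the ".. deprecated::" notes of the stub docstrings.
new_style = df.select(pl.col("a").cum_sum())
sampled = df.select(pl.col("a").gather_every(2))
mapped = df.select(pl.col("a").map_elements(lambda x: x * 2, return_dtype=pl.Int64))
# ----------------------------------------------------------------------------------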
- def _comparison_error(self, operator: str) -> NoReturn: ... - def __eq__(self, other: Any) -> NoReturn: ... - def __ne__(self, other: Any) -> NoReturn: ... - def __gt__(self, other: Any) -> NoReturn: ... - def __lt__(self, other: Any) -> NoReturn: ... - def __ge__(self, other: Any) -> NoReturn: ... - def __le__(self, other: Any) -> NoReturn: ... - def __contains__(self, key: str) -> bool: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __getitem__(self, item: int | range | slice) -> LazyFrame: ... - def _repr_html_(self) -> str: ... - def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: - ''' - Serialize the logical plan of this LazyFrame to a file or string in JSON format. - - Parameters - ---------- - file - File path to which the result should be written. If set to `None` - (default), the output is returned as a string instead. - - See Also - -------- - LazyFrame.deserialize - - Examples - -------- - Serialize the logical plan into a JSON string. - - >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() - >>> json = lf.serialize() - >>> json - \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' - - The logical plan can later be deserialized back into a LazyFrame. - - >>> import io - >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - └─────┘ - - ''' - def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: - """ - Serialize the logical plan of this LazyFrame to a file or string in JSON format. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`LazyFrame.serialize`. - - Parameters - ---------- - file - File path to which the result should be written. If set to `None` - (default), the output is returned as a string instead. - """ - def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: - ''' - Offers a structured way to apply a sequence of user-defined functions (UDFs). - - Parameters - ---------- - function - Callable; will receive the frame as the first parameter, - followed by any given args/kwargs. - *args - Arguments to pass to the UDF. - **kwargs - Keyword arguments to pass to the UDF. - - Examples - -------- - >>> def cast_str_to_int(data, col_name): - ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) - ... - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": ["10", "20", "30", "40"], - ... } - ... ) - >>> lf.pipe(cast_str_to_int, col_name="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 10 │ - │ 2 ┆ 20 │ - │ 3 ┆ 30 │ - │ 4 ┆ 40 │ - └─────┴─────┘ - - >>> lf = pl.LazyFrame( - ... { - ... "b": [1, 2], - ... "a": [3, 4], - ... } - ... 
) - >>> lf.collect() - shape: (2, 2) - ┌─────┬─────┐ - │ b ┆ a │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 4 │ - └─────┴─────┘ - >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 1 │ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def explain(self) -> str: - ''' - Create a string representation of the query plan. - - Different optimizations can be turned on or off. - - Parameters - ---------- - optimized - Return an optimized query plan. Defaults to `True`. - If this is set to `True` the subsequent - optimization flags control which optimizations - run. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).explain() # doctest: +SKIP - ''' - def show_graph(self) -> str | None: - ''' - Show a plot of the query plan. Note that you should have graphviz installed. - - Parameters - ---------- - optimized - Optimize the query plan. - show - Show the figure. - output_path - Write the figure to disk. - raw_output - Return dot syntax. This cannot be combined with `show` and/or `output_path`. - figsize - Passed to matplotlib if `show` == True. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).show_graph() # doctest: +SKIP - - ''' - def inspect(self, fmt: str = ...) -> Self: - ''' - Inspect a node in the computation graph. - - Print the value that this node in the computation graph evaluates to and passes - on the value. - - Examples - -------- - >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) - >>> ( - ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) - ... .inspect() # print the node before the filter - ... .filter(pl.col("bar") == pl.col("foo")) - ... ) # doctest: +ELLIPSIS - - - ''' - def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: - ''' - Sort the DataFrame by the given columns. - - Parameters - ---------- - by - Column(s) to sort by. Accepts expression input. Strings are parsed as column - names. 
- *more_by - Additional columns to sort by, specified as positional arguments. - descending - Sort in descending order. When sorting by multiple columns, can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - Examples - -------- - Pass a single column name to sort by that column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, None], - ... "b": [6.0, 5.0, 4.0], - ... "c": ["a", "c", "b"], - ... } - ... ) - >>> lf.sort("a").collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - Sorting by expressions is also supported. - - >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - └──────┴─────┴─────┘ - - Sort by multiple columns by passing a list of columns. - - >>> lf.sort(["c", "a"], descending=True).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 2 ┆ 5.0 ┆ c │ - │ null ┆ 4.0 ┆ b │ - │ 1 ┆ 6.0 ┆ a │ - └──────┴─────┴─────┘ - - Or use positional arguments to sort by multiple columns in the same way. - - >>> lf.sort("c", "a", descending=[False, True]).collect() - shape: (3, 3) - ┌──────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞══════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ null ┆ 4.0 ┆ b │ - │ 2 ┆ 5.0 ┆ c │ - └──────┴─────┴─────┘ - - ''' - def top_k(self, k: int) -> Self: - ''' - Return the `k` largest elements. - - If \'descending=True` the smallest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might - be worse since this requires a stable search. - - See Also - -------- - bottom_k - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 largest values in column b. - - >>> lf.top_k(4, by="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ a ┆ 2 │ - │ b ┆ 2 │ - │ b ┆ 1 │ - └─────┴─────┘ - - Get the rows which contain the 4 largest values when sorting on column b and a. - - >>> lf.top_k(4, by=["b", "a"]).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 2 │ - │ c ┆ 1 │ - └─────┴─────┘ - - ''' - def bottom_k(self, k: int) -> Self: - ''' - Return the `k` smallest elements. - - If \'descending=True` the largest elements will be given. - - Parameters - ---------- - k - Number of rows to return. - by - Column(s) included in sort order. 
Accepts expression input. - Strings are parsed as column names. - descending - Return the \'k\' smallest. Top-k by multiple columns can be specified - per column by passing a sequence of booleans. - nulls_last - Place null values last. - maintain_order - Whether the order should be maintained if elements are equal. - Note that if `true` streaming is not possible and performance might be - worse since this requires a stable search. - - See Also - -------- - top_k - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [2, 1, 1, 3, 2, 1], - ... } - ... ) - - Get the rows which contain the 4 smallest values in column b. - - >>> lf.bottom_k(4, by="b").collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ b ┆ 1 │ - │ a ┆ 1 │ - │ c ┆ 1 │ - │ a ┆ 2 │ - └─────┴─────┘ - - Get the rows which contain the 4 smallest values when sorting on column a and b. - - >>> lf.bottom_k(4, by=["a", "b"]).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ b ┆ 1 │ - │ b ┆ 2 │ - └─────┴─────┘ - - ''' - def profile(self) -> tuple[DataFrame, DataFrame]: - ''' - Profile a LazyFrame. - - This will run the query and return a tuple - containing the materialized DataFrame and a DataFrame that - contains profiling information of each node that is executed. - - The units of the timings are microseconds. - - Parameters - ---------- - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - show_plot - Show a gantt chart of the profiling result - truncate_nodes - Truncate the label lengths in the gantt chart to this number of - characters. - figsize - matplotlib figsize of the profiling plot - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( - ... "a" - ... ).profile() # doctest: +SKIP - (shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘, - shape: (3, 3) - ┌─────────────────────────┬───────┬──────┐ - │ node ┆ start ┆ end │ - │ --- ┆ --- ┆ --- │ - │ str ┆ u64 ┆ u64 │ - ╞═════════════════════════╪═══════╪══════╡ - │ optimization ┆ 0 ┆ 5 │ - │ group_by_partitioned(a) ┆ 5 ┆ 470 │ - │ sort(a) ┆ 475 ┆ 1964 │ - └─────────────────────────┴───────┴──────┘) - - ''' - def collect(self) -> DataFrame: - ''' - Materialize this LazyFrame into a DataFrame. - - By default, all query optimizations are enabled. Individual optimizations may - be disabled by setting the corresponding parameter to `False`. - - Parameters - ---------- - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. 
- simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - no_optimization - Turn off (certain) optimizations. - streaming - Process the query in batches to handle larger-than-memory data. - If set to `False` (default), the entire query is processed in a single - batch. - - .. warning:: - This functionality is currently in an alpha state. - - .. note:: - Use :func:`explain` to see if Polars can process the query in streaming - mode. - - Returns - ------- - DataFrame - - See Also - -------- - fetch: Run the query on the first `n` rows only for debugging purposes. - explain : Print the query plan that is evaluated with collect. - profile : Collect the LazyFrame and time each node in the computation graph. - polars.collect_all : Collect multiple LazyFrames at the same time. - polars.Config.set_streaming_chunk_size : Set the size of streaming batches. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - - Collect in streaming mode - - >>> lf.group_by("a").agg(pl.all().sum()).collect( - ... streaming=True - ... ) # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: - ''' - Collect DataFrame asynchronously in thread pool. - - Collects into a DataFrame (like :func:`collect`), but instead of returning - DataFrame directly, they are scheduled to be collected inside thread pool, - while this method returns almost instantly. - - May be useful if you use gevent or asyncio and want to release control to other - greenlets/tasks while LazyFrames are being collected. - - Parameters - ---------- - gevent - Return wrapper to `gevent.event.AsyncResult` instead of Awaitable - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Notes - ----- - In case of error `set_exception` is used on - `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - See Also - -------- - polars.collect_all : Collect multiple LazyFrames at the same time. - polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. - - Returns - ------- - If `gevent=False` (default) then returns awaitable. 
- - If `gevent=True` then returns wrapper that has - `.get(block=True, timeout=None)` method. - - Examples - -------- - >>> import asyncio - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> async def main(): - ... return await ( - ... lf.group_by("a", maintain_order=True) - ... .agg(pl.all().sum()) - ... .collect_async() - ... ) - ... - >>> asyncio.run(main()) - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 4 ┆ 10 │ - │ b ┆ 11 ┆ 10 │ - │ c ┆ 6 ┆ 1 │ - └─────┴─────┴─────┘ - ''' - def sink_parquet(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to a Parquet file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. - Choose "snappy" for more backwards compatibility guarantees - when you deal with older parquet readers. - compression_level - The level of compression to use. Higher compression means smaller files on - disk. - - - "gzip" : min-level: 0, max-level: 10. - - "brotli" : min-level: 0, max-level: 11. - - "zstd" : min-level: 1, max-level: 22. - statistics - Write statistics to the parquet headers. This requires extra compute. - row_group_size - Size of the row groups in number of rows. - If None (default), the chunks of the `DataFrame` are - used. Writing in smaller chunks may reduce memory pressure and improve - writing speeds. - data_pagesize_limit - Size limit of individual data pages. - If not set defaults to 1024 * 1024 bytes - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_parquet("out.parquet") # doctest: +SKIP - - ''' - def sink_ipc(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to an IPC file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - compression : {\'lz4\', \'zstd\'} - Choose "zstd" for good compression performance. - Choose "lz4" for fast compression/decompression. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. 
- - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_ipc("out.arrow") # doctest: +SKIP - - ''' - def sink_csv(self, path: str | Path) -> DataFrame: - ''' - Evaluate the query in streaming mode and write to a CSV file. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - include_bom - Whether to include UTF-8 BOM in the CSV output. - include_header - Whether to include header in the CSV output. - separator - Separate CSV fields with this symbol. - line_terminator - String used to end each row. - quote_char - Byte to use as quoting character. - batch_size - Number of rows that will be processed per thread. - datetime_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. If no format specified, the default fractional-second - precision is inferred from the maximum timeunit found in the frame\'s - Datetime cols (if any). - date_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. - time_format - A format string, with the specifiers defined by the - `chrono `_ - Rust crate. - float_precision - Number of decimal places to write, applied to both `Float32` and - `Float64` datatypes. - null_value - A string representing null values (defaulting to the empty string). - quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} - Determines the quoting strategy used. - - - necessary (default): This puts quotes around fields only when necessary. - They are necessary when fields contain a quote, - delimiter or record terminator. - Quotes are also necessary when writing an empty record - (which is indistinguishable from a record with one empty field). - This is the default. - - always: This puts quotes around every field. Always. - - never: This never puts quotes around fields, even if that results in - invalid CSV data (e.g.: by not quoting strings containing the - separator). - - non_numeric: This puts quotes around all fields that are non-numeric. - Namely, when writing a field that does not parse as a valid float - or integer, then quotes will be used even if they aren`t strictly - necessary. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - slice_pushdown - Slice pushdown optimization. - no_optimization - Turn off (certain) optimizations. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_csv("out.csv") # doctest: +SKIP - - ''' - def sink_ndjson(self, path: str | Path) -> DataFrame: - ''' - Persists a LazyFrame at the provided path. - - This allows streaming results that are larger than RAM to be written to disk. - - Parameters - ---------- - path - File path to which the file should be written. - maintain_order - Maintain the order in which data is processed. - Setting this to `False` will be slightly faster. - type_coercion - Do type coercion optimization. - predicate_pushdown - Do predicate pushdown optimization. - projection_pushdown - Do projection pushdown optimization. 
- simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off (certain) optimizations. - slice_pushdown - Slice pushdown optimization. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP - >>> lf.sink_json("out.json") # doctest: +SKIP - - ''' - def _set_sink_optimizations(self) -> PyLazyFrame: ... - def fetch(self, n_rows: int = ...) -> DataFrame: - ''' - Collect a small number of rows for debugging purposes. - - Parameters - ---------- - n_rows - Collect n_rows from the data sources. - type_coercion - Run type coercion optimization. - predicate_pushdown - Run predicate pushdown optimization. - projection_pushdown - Run projection pushdown optimization. - simplify_expression - Run simplify expressions optimization. - no_optimization - Turn off optimizations. - slice_pushdown - Slice pushdown optimization - comm_subplan_elim - Will try to cache branching subplans that occur on self-joins or unions. - comm_subexpr_elim - Common subexpressions will be cached and reused. - streaming - Run parts of the query in a streaming fashion (this is in an alpha state) - - Notes - ----- - This is similar to a :func:`collect` operation, but it overwrites the number of - rows read by *every* scan operation. Be aware that `fetch` does not guarantee - the final number of rows in the DataFrame. Filters, join operations and fewer - rows being available in the scanned data will all influence the final number - of rows (joins are especially susceptible to this, and may return no data - at all if `n_rows` is too small as the join keys may not be present). - - Warnings - -------- - This is strictly a utility function that can help to debug queries using a - smaller number of rows, and should *not* be used in production code. - - Returns - ------- - DataFrame - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 6 │ - │ b ┆ 2 ┆ 5 │ - └─────┴─────┴─────┘ - - ''' - def lazy(self) -> Self: - ''' - Return lazy representation, i.e. itself. - - Useful for writing code that expects either a :class:`DataFrame` or - :class:`LazyFrame`. - - Returns - ------- - LazyFrame - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.lazy() # doctest: +ELLIPSIS - - - ''' - def cache(self) -> Self: - """Cache the result once the execution of the physical plan hits this node.""" - def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: - ''' - Cast LazyFrame column(s) to the specified dtype(s). - - Parameters - ---------- - dtypes - Mapping of column names (or selector) to dtypes, or a single dtype - to which all columns will be cast. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> from datetime import date - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], - ... } - ... 
) - - Cast specific frame columns to the specified dtypes: - - >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ u8 ┆ date │ - ╞═════╪═════╪════════════╡ - │ 1.0 ┆ 6 ┆ 2020-01-02 │ - │ 2.0 ┆ 7 ┆ 2021-03-04 │ - │ 3.0 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - Cast all frame columns to the specified dtype: - - >>> lf.cast(pl.Utf8).collect().to_dict(as_series=False) - {\'foo\': [\'1\', \'2\', \'3\'], - \'bar\': [\'6.0\', \'7.0\', \'8.0\'], - \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} - - Use selectors to define the columns being cast: - - >>> import polars.selectors as cs - >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.Utf8}).collect() - shape: (3, 3) - ┌─────┬─────┬────────────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ str │ - ╞═════╪═════╪════════════╡ - │ 1 ┆ 6 ┆ 2020-01-02 │ - │ 2 ┆ 7 ┆ 2021-03-04 │ - │ 3 ┆ 8 ┆ 2022-05-06 │ - └─────┴─────┴────────────┘ - - ''' - def clear(self, n: int = ...) -> LazyFrame: - ''' - Create an empty copy of the current LazyFrame, with zero to \'n\' rows. - - Returns a copy with an identical schema but no data. - - Parameters - ---------- - n - Number of (empty) rows to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.clear().fetch() - shape: (0, 3) - ┌─────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞═════╪═════╪══════╡ - └─────┴─────┴──────┘ - - >>> lf.clear(2).fetch() - shape: (2, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool │ - ╞══════╪══════╪══════╡ - │ null ┆ null ┆ null │ - │ null ┆ null ┆ null │ - └──────┴──────┴──────┘ - - ''' - def clone(self) -> Self: - ''' - Create a copy of this LazyFrame. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current LazyFrame, with identical - schema but no data. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, 2, 3, 4], - ... "b": [0.5, None, 2.5, 13], - ... "c": [True, True, False, None], - ... } - ... ) - >>> lf.clone() # doctest: +ELLIPSIS - - - ''' - def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: - ''' - Filter the rows in the LazyFrame based on a predicate expression. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - predicates - Expression that evaluates to a boolean Series. - constraints - Column filters. Use name=value to filter column name by the supplied value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... 
) - - Filter on one condition: - - >>> lf.filter(pl.col("foo") > 1).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - Filter on multiple conditions: - - >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `*args` syntax: - - >>> lf.filter( - ... pl.col("foo") == 1, - ... pl.col("ham") == "a", - ... ).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Provide multiple filters using `**kwargs` syntax: - - >>> lf.filter(foo=1, ham="a").collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - Filter on an OR condition: - - >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - ''' - Select columns from this LazyFrame. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Examples - -------- - Pass the name of a column to select that column. - - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.select("foo").collect() - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - - Multiple columns can be selected by passing a list of column names. - - >>> lf.select(["foo", "bar"]).collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - └─────┴─────┘ - - Multiple columns can also be selected using positional arguments instead of a - list. Expressions are also accepted. - - >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - └─────┴─────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> lf.select( - ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) - ... ).collect() - shape: (3, 1) - ┌───────────┐ - │ threshold │ - │ --- │ - │ i32 │ - ╞═══════════╡ - │ 0 │ - │ 0 │ - │ 10 │ - └───────────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... lf.select( - ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), - ... ).collect() - ... 
- shape: (3, 1) - ┌───────────┐ - │ is_odd │ - │ --- │ - │ struct[2] │ - ╞═══════════╡ - │ {1,0} │ - │ {0,1} │ - │ {1,0} │ - └───────────┘ - - ''' - def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - """ - Select columns from this LazyFrame. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to select, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, - other non-expression inputs are parsed as literals. - **named_exprs - Additional columns to select, specified as keyword arguments. - The columns will be renamed to the keyword used. - - See Also - -------- - select - - """ - def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: - ''' - Start a group by operation. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Setting this to `True` blocks the possibility - to run on the streaming engine. - - Examples - -------- - Group by one column and call `agg` to compute the grouped sum of another - column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "c"], - ... "b": [1, 2, 1, 3, 3], - ... "c": [5, 4, 3, 2, 1], - ... } - ... ) - >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 2 │ - │ b ┆ 5 │ - │ c ┆ 3 │ - └─────┴─────┘ - - Set `maintain_order=True` to ensure the order of the groups is consistent with - the input. - - >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() - shape: (3, 2) - ┌─────┬───────────┐ - │ a ┆ c │ - │ --- ┆ --- │ - │ str ┆ list[i64] │ - ╞═════╪═══════════╡ - │ a ┆ [5, 3] │ - │ b ┆ [4, 2] │ - │ c ┆ [1] │ - └─────┴───────────┘ - - Group by multiple columns by passing a list of column names. - - >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - - Or use positional arguments to group by multiple columns in the same way. - Expressions are also accepted. - - >>> lf.group_by("a", pl.col("b") // 2).agg( - ... pl.col("c").mean() - ... ).collect() # doctest: +SKIP - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ f64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 0 ┆ 4.0 │ - │ b ┆ 1 ┆ 3.0 │ - │ c ┆ 1 ┆ 1.0 │ - └─────┴─────┴─────┘ - - ''' - def rolling(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Create rolling groups based on a time, Int32, or Int64 column. - - Different from a `dynamic_group_by` the windows are now determined by the - individual values and are not of constant intervals. For constant intervals - use :func:`LazyFrame.group_by_dynamic`. - - If you have a time series ``, then by default the - windows created will be - - * (t_0 - period, t_0] - * (t_1 - period, t_1] - * ... 
- * (t_n - period, t_n] - - whereas if you pass a non-default `offset`, then the windows will be - - * (t_0 + offset, t_0 + offset + period] - * (t_1 + offset, t_1 + offset + period] - * ... - * (t_n + offset, t_n + offset + period] - - The `period` and `offset` arguments are created either from a timedelta, or - by using the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a rolling operation on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - group_by_dynamic - - Examples - -------- - >>> dates = [ - ... "2020-01-01 13:45:48", - ... "2020-01-01 16:42:13", - ... "2020-01-01 16:45:09", - ... "2020-01-02 18:12:48", - ... "2020-01-03 19:45:32", - ... "2020-01-08 23:16:43", - ... ] - >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( - ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() - ... ) - >>> out = ( - ... df.rolling(index_column="dt", period="2d") - ... .agg( - ... pl.sum("a").alias("sum_a"), - ... pl.min("a").alias("min_a"), - ... pl.max("a").alias("max_a"), - ... ) - ... .collect() - ... 
) - >>> out - shape: (6, 4) - ┌─────────────────────┬───────┬───────┬───────┐ - │ dt ┆ sum_a ┆ min_a ┆ max_a │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ - ╞═════════════════════╪═══════╪═══════╪═══════╡ - │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ - │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ - │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ - │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ - │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ - │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ - └─────────────────────┴───────┴───────┴───────┘ - - ''' - def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - Time windows are calculated and rows are assigned to windows. Different from a - normal group by is that a row can be member of multiple groups. - By default, the windows look like: - - - [start, start + period) - - [start + every, start + every + period) - - [start + 2*every, start + 2*every + period) - - ... - - where `start` is determined by `start_by`, `offset`, and `every` (see parameter - descriptions below). - - .. warning:: - The index column must be sorted in ascending order. If `by` is passed, then - the index column must be sorted in ascending order within each group. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - - .. deprecated:: 0.19.4 - Use `label` instead. - include_boundaries - Add the lower and upper bound of the window to the "_lower_boundary" and - "_upper_boundary" columns. This will impact performance because it\'s harder to - parallelize - closed : {\'left\', \'right\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - label : {\'left\', \'right\', \'datapoint\'} - Define which label to use for the window: - - - \'left\': lower boundary of the window - - \'right\': upper boundary of the window - - \'datapoint\': the first value of the index column in the given window. - If you don\'t need the label to be at one of the boundaries, choose this - option for maximum performance - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. 
- check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - See Also - -------- - rolling - - Notes - ----- - 1) If you\'re coming from pandas, then - - .. code-block:: python - - # polars - df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) - - is equivalent to - - .. code-block:: python - - # pandas - df.set_index("ts").resample("D")["value"].sum().reset_index() - - though note that, unlike pandas, polars doesn\'t add extra rows for empty - windows. If you need `index_column` to be evenly spaced, then please combine - with :func:`DataFrame.upsample`. - - 2) The `every`, `period` and `offset` arguments are created with - the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day (which may - not be 24 hours, due to daylight savings). Similarly for "calendar week", - "calendar month", "calendar quarter", and "calendar year". - - In case of a group_by_dynamic on an integer column, the windows are defined by: - - - "1i" # length 1 - - "10i" # length 10 - - Examples - -------- - >>> from datetime import datetime - >>> lf = pl.LazyFrame( - ... { - ... "time": pl.datetime_range( - ... start=datetime(2021, 12, 16), - ... end=datetime(2021, 12, 16, 3), - ... interval="30m", - ... eager=True, - ... ), - ... "n": range(7), - ... } - ... ) - >>> lf.collect() - shape: (7, 2) - ┌─────────────────────┬─────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i64 │ - ╞═════════════════════╪═════╡ - │ 2021-12-16 00:00:00 ┆ 0 │ - │ 2021-12-16 00:30:00 ┆ 1 │ - │ 2021-12-16 01:00:00 ┆ 2 │ - │ 2021-12-16 01:30:00 ┆ 3 │ - │ 2021-12-16 02:00:00 ┆ 4 │ - │ 2021-12-16 02:30:00 ┆ 5 │ - │ 2021-12-16 03:00:00 ┆ 6 │ - └─────────────────────┴─────┘ - - Group by windows of 1 hour starting at 2021-12-16 00:00:00. - - >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( - ... pl.col("n") - ... ).collect() - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [1, 2] │ - │ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ 2021-12-16 02:00:00 ┆ [5, 6] │ - └─────────────────────┴───────────┘ - - The window boundaries can also be added to the aggregation result - - >>> lf.group_by_dynamic( - ... "time", every="1h", include_boundaries=True, closed="right" - ... 
).agg(pl.col("n").mean()).collect() - shape: (4, 4) - ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ - │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ - ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ - │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ - │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ - │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ - │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ - └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ - - When closed="left", the window excludes the right end of interval: - [lower_bound, upper_bound) - - >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( - ... pl.col("n") - ... ).collect() - shape: (4, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-16 00:00:00 ┆ [0, 1] │ - │ 2021-12-16 01:00:00 ┆ [2, 3] │ - │ 2021-12-16 02:00:00 ┆ [4, 5] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - When closed="both" the time values at the window boundaries belong to 2 groups. - - >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( - ... pl.col("n") - ... ).collect() - shape: (5, 2) - ┌─────────────────────┬───────────┐ - │ time ┆ n │ - │ --- ┆ --- │ - │ datetime[μs] ┆ list[i64] │ - ╞═════════════════════╪═══════════╡ - │ 2021-12-15 23:00:00 ┆ [0] │ - │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ - │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ - │ 2021-12-16 03:00:00 ┆ [6] │ - └─────────────────────┴───────────┘ - - Dynamic group bys can also be combined with grouping on normal keys - - >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) - >>> lf.collect() - shape: (7, 3) - ┌─────────────────────┬─────┬────────┐ - │ time ┆ n ┆ groups │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i64 ┆ str │ - ╞═════════════════════╪═════╪════════╡ - │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ - │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ - │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ - │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ - │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ - │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ - │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ - └─────────────────────┴─────┴────────┘ - >>> lf.group_by_dynamic( - ... "time", - ... every="1h", - ... closed="both", - ... by="groups", - ... include_boundaries=True, - ... 
).agg(pl.col("n")).collect() - shape: (7, 5) - ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ - │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ - ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ - │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ - │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ - │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ - │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ - │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ - │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ - │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ - └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ - - Dynamic group by on an index column - - >>> lf = pl.LazyFrame( - ... { - ... "idx": pl.int_range(0, 6, eager=True), - ... "A": ["A", "A", "B", "B", "B", "C"], - ... } - ... ) - >>> lf.group_by_dynamic( - ... "idx", - ... every="2i", - ... period="3i", - ... include_boundaries=True, - ... closed="right", - ... ).agg(pl.col("A").alias("A_agg_list")).collect() - shape: (4, 4) - ┌─────────────────┬─────────────────┬─────┬─────────────────┐ - │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 ┆ list[str] │ - ╞═════════════════╪═════════════════╪═════╪═════════════════╡ - │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ - │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ - │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ - │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ - └─────────────────┴─────────────────┴─────┴─────────────────┘ - - ''' - def join_asof(self, other: LazyFrame) -> Self: - ''' - Perform an asof join. - - This is similar to a left-join except that we match on nearest key rather than - equal keys. - - Both DataFrames must be sorted by the join_asof key. - - For each row in the left DataFrame: - - - A "backward" search selects the last row in the right DataFrame whose - \'on\' key is less than or equal to the left\'s key. - - - A "forward" search selects the first row in the right DataFrame whose - \'on\' key is greater than or equal to the left\'s key. - - A "nearest" search selects the last row in the right DataFrame whose value - is nearest to the left\'s key. String keys are not currently supported for a - nearest search. - - The default is "backward". - - Parameters - ---------- - other - Lazy DataFrame to join with. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - by - Join on these columns before doing asof join. - by_left - Join on these columns before doing asof join. - by_right - Join on these columns before doing asof join. - strategy : {\'backward\', \'forward\', \'nearest\'} - Join strategy. - suffix - Suffix to append to columns with a duplicate name. - tolerance - Numeric tolerance. By setting this the join will only be done if the near - keys are within this distance. 
If an asof join is done on columns of dtype - "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta - object or the following string language: - - - 1ns (1 nanosecond) - - 1us (1 microsecond) - - 1ms (1 millisecond) - - 1s (1 second) - - 1m (1 minute) - - 1h (1 hour) - - 1d (1 calendar day) - - 1w (1 calendar week) - - 1mo (1 calendar month) - - 1q (1 calendar quarter) - - 1y (1 calendar year) - - 1i (1 index count) - - Or combine them: - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds - - By "calendar day", we mean the corresponding time on the next day - (which may not be 24 hours, due to daylight savings). Similarly for - "calendar week", "calendar month", "calendar quarter", and - "calendar year". - - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. - - Examples - -------- - >>> from datetime import datetime - >>> gdp = pl.LazyFrame( - ... { - ... "date": [ - ... datetime(2016, 1, 1), - ... datetime(2017, 1, 1), - ... datetime(2018, 1, 1), - ... datetime(2019, 1, 1), - ... ], # note record date: Jan 1st (sorted!) - ... "gdp": [4164, 4411, 4566, 4696], - ... } - ... ).set_sorted("date") - >>> population = pl.LazyFrame( - ... { - ... "date": [ - ... datetime(2016, 5, 12), - ... datetime(2017, 5, 12), - ... datetime(2018, 5, 12), - ... datetime(2019, 5, 12), - ... ], # note record date: May 12th (sorted!) - ... "population": [82.19, 82.66, 83.12, 83.52], - ... } - ... ).set_sorted("date") - >>> population.join_asof(gdp, on="date", strategy="backward").collect() - shape: (4, 3) - ┌─────────────────────┬────────────┬──────┐ - │ date ┆ population ┆ gdp │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ f64 ┆ i64 │ - ╞═════════════════════╪════════════╪══════╡ - │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ - │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ - │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ - │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ - └─────────────────────┴────────────┴──────┘ - - ''' - def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: - ''' - Add a join operation to the Logical Plan. - - Parameters - ---------- - other - Lazy DataFrame to join with. - on - Join column of both DataFrames. If set, `left_on` and `right_on` should be - None. - how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\'} - Join strategy. - - .. note:: - A left join preserves the row order of the left DataFrame. - left_on - Join column of the left DataFrame. - right_on - Join column of the right DataFrame. - suffix - Suffix to append to columns with a duplicate name. - validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} - Checks if join is of specified type. - - * *many_to_many* - “m:m”: default, does not result in checks - * *one_to_one* - “1:1”: check if join keys are unique in both left and right datasets - * *one_to_many* - “1:m”: check if join keys are unique in left dataset - * *many_to_one* - “m:1”: check if join keys are unique in right dataset - - .. note:: - - - This is currently not supported the streaming engine. - - This is only supported when joined by single columns. - allow_parallel - Allow the physical plan to optionally evaluate the computation of both - DataFrames up to the join in parallel. - force_parallel - Force the physical plan to evaluate the computation of both DataFrames up to - the join in parallel. 
- - See Also - -------- - join_asof - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> other_lf = pl.LazyFrame( - ... { - ... "apple": ["x", "y", "z"], - ... "ham": ["a", "b", "d"], - ... } - ... ) - >>> lf.join(other_lf, on="ham").collect() - shape: (2, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - └─────┴─────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="outer").collect() - shape: (4, 4) - ┌──────┬──────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞══════╪══════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ null ┆ null ┆ d ┆ z │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └──────┴──────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="left").collect() - shape: (3, 4) - ┌─────┬─────┬─────┬───────┐ - │ foo ┆ bar ┆ ham ┆ apple │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str ┆ str │ - ╞═════╪═════╪═════╪═══════╡ - │ 1 ┆ 6.0 ┆ a ┆ x │ - │ 2 ┆ 7.0 ┆ b ┆ y │ - │ 3 ┆ 8.0 ┆ c ┆ null │ - └─────┴─────┴─────┴───────┘ - >>> lf.join(other_lf, on="ham", how="semi").collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6.0 ┆ a │ - │ 2 ┆ 7.0 ┆ b │ - └─────┴─────┴─────┘ - >>> lf.join(other_lf, on="ham", how="anti").collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 3 ┆ 8.0 ┆ c │ - └─────┴─────┴─────┘ - - ''' - def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - ''' - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - Notes - ----- - Creating a new LazyFrame using this method does not create a new copy of - existing data. - - Examples - -------- - Pass an expression to add it as a new column. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [0.5, 4, 10, 13], - ... "c": [True, True, False, True], - ... } - ... ) - >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() - shape: (4, 4) - ┌─────┬──────┬───────┬──────┐ - │ a ┆ b ┆ c ┆ a^2 │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 │ - ╞═════╪══════╪═══════╪══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ - └─────┴──────┴───────┴──────┘ - - Added columns will replace existing columns with the same name. - - >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() - shape: (4, 3) - ┌─────┬──────┬───────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╡ - │ 1.0 ┆ 0.5 ┆ true │ - │ 2.0 ┆ 4.0 ┆ true │ - │ 3.0 ┆ 10.0 ┆ false │ - │ 4.0 ┆ 13.0 ┆ true │ - └─────┴──────┴───────┘ - - Multiple columns can be added by passing a list of expressions. - - >>> lf.with_columns( - ... [ - ... 
(pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ] - ... ).collect() - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Multiple columns also can be added using positional arguments instead of a list. - - >>> lf.with_columns( - ... (pl.col("a") ** 2).alias("a^2"), - ... (pl.col("b") / 2).alias("b/2"), - ... (pl.col("c").not_()).alias("not c"), - ... ).collect() - shape: (4, 6) - ┌─────┬──────┬───────┬──────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ - └─────┴──────┴───────┴──────┴──────┴───────┘ - - Use keyword arguments to easily name your expression inputs. - - >>> lf.with_columns( - ... ab=pl.col("a") * pl.col("b"), - ... not_c=pl.col("c").not_(), - ... ).collect() - shape: (4, 5) - ┌─────┬──────┬───────┬──────┬───────┐ - │ a ┆ b ┆ c ┆ ab ┆ not_c │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ - ╞═════╪══════╪═══════╪══════╪═══════╡ - │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ - │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ - │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ - │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ - └─────┴──────┴───────┴──────┴───────┘ - - Expressions with multiple outputs can be automatically instantiated as Structs - by enabling the setting `Config.set_auto_structify(True)`: - - >>> with pl.Config(auto_structify=True): - ... lf.drop("c").with_columns( - ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), - ... ).collect() - ... - shape: (4, 3) - ┌─────┬──────┬─────────────┐ - │ a ┆ b ┆ diffs │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ struct[2] │ - ╞═════╪══════╪═════════════╡ - │ 1 ┆ 0.5 ┆ {null,null} │ - │ 2 ┆ 4.0 ┆ {1,3.5} │ - │ 3 ┆ 10.0 ┆ {1,6.0} │ - │ 4 ┆ 13.0 ┆ {1,3.0} │ - └─────┴──────┴─────────────┘ - - ''' - def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: - """ - Add columns to this DataFrame. - - Added columns will replace existing columns with the same name. - - This will run all expression sequentially instead of in parallel. - Use this when the work per expression is cheap. - - Parameters - ---------- - *exprs - Column(s) to add, specified as positional arguments. - Accepts expression input. Strings are parsed as column names, other - non-expression inputs are parsed as literals. - **named_exprs - Additional columns to add, specified as keyword arguments. - The columns will be renamed to the keyword used. - - Returns - ------- - LazyFrame - A new LazyFrame with the columns added. - - See Also - -------- - with_columns - - """ - def with_context(self, other: Self | list[Self]) -> Self: - ''' - Add an external context to the computation graph. - - This allows expressions to also access columns from DataFrames - that are not part of this one. - - Parameters - ---------- - other - Lazy DataFrame to join with. 
- - Examples - -------- - >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) - >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) - >>> lf.with_context(lf_other).select( - ... pl.col("b") + pl.col("c").first() - ... ).collect() - shape: (3, 1) - ┌──────┐ - │ b │ - │ --- │ - │ str │ - ╞══════╡ - │ afoo │ - │ cfoo │ - │ null │ - └──────┘ - - Fill nulls with the median from another DataFrame: - - >>> train_lf = pl.LazyFrame( - ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} - ... ) - >>> test_lf = pl.LazyFrame( - ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} - ... ) - >>> test_lf.with_context( - ... train_lf.select(pl.all().name.suffix("_train")) - ... ).select( - ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) - ... ).collect() - shape: (3, 1) - ┌───────────┐ - │ feature_0 │ - │ --- │ - │ f64 │ - ╞═══════════╡ - │ -1.0 │ - │ 0.0 │ - │ 1.0 │ - └───────────┘ - - ''' - def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Remove columns from the DataFrame. - - Parameters - ---------- - columns - Name of the column(s) that should be removed from the DataFrame. - *more_columns - Additional columns to drop, specified as positional arguments. - - Examples - -------- - Drop a single column by passing the name of that column. - - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.drop("ham").collect() - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪═════╡ - │ 1 ┆ 6.0 │ - │ 2 ┆ 7.0 │ - │ 3 ┆ 8.0 │ - └─────┴─────┘ - - Drop multiple columns by passing a selector. - - >>> import polars.selectors as cs - >>> lf.drop(cs.numeric()).collect() - shape: (3, 1) - ┌─────┐ - │ ham │ - │ --- │ - │ str │ - ╞═════╡ - │ a │ - │ b │ - │ c │ - └─────┘ - - Use positional arguments to drop multiple columns. - - >>> lf.drop("foo", "ham").collect() - shape: (3, 1) - ┌─────┐ - │ bar │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 6.0 │ - │ 7.0 │ - │ 8.0 │ - └─────┘ - - ''' - def rename(self, mapping: dict[str, str]) -> Self: - ''' - Rename column names. - - Parameters - ---------- - mapping - Key value pairs that map from old name to new name. - - Notes - ----- - If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), - polars will block projection and predicate pushdowns at this node. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, 7, 8], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.rename({"foo": "apple"}).collect() - shape: (3, 3) - ┌───────┬─────┬─────┐ - │ apple ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═══════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └───────┴─────┴─────┘ - - ''' - def reverse(self) -> Self: - ''' - Reverse the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "key": ["a", "b", "c"], - ... "val": [1, 2, 3], - ... } - ... ) - >>> lf.reverse().collect() - shape: (3, 2) - ┌─────┬─────┐ - │ key ┆ val │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ c ┆ 3 │ - │ b ┆ 2 │ - │ a ┆ 1 │ - └─────┴─────┘ - - ''' - def shift(self, n: int | IntoExprColumn = ...) -> Self: - ''' - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. 
- fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... ) - >>> lf.shift().collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ null ┆ null │ - │ 1 ┆ 5 │ - │ 2 ┆ 6 │ - │ 3 ┆ 7 │ - └──────┴──────┘ - - Pass a negative value to shift in the opposite direction instead. - - >>> lf.shift(-2).collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞══════╪══════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ null ┆ null │ - │ null ┆ null │ - └──────┴──────┘ - - Specify `fill_value` to fill the resulting null values. - - >>> lf.shift(-2, fill_value=100).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 7 │ - │ 4 ┆ 8 │ - │ 100 ┆ 100 │ - │ 100 ┆ 100 │ - └─────┴─────┘ - - ''' - def slice(self, offset: int, length: int | None = ...) -> Self: - ''' - Get a slice of this DataFrame. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> lf.slice(1, 2).collect() - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ y ┆ 3 ┆ 4 │ - │ z ┆ 5 ┆ 6 │ - └─────┴─────┴─────┘ - - ''' - def limit(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Alias for :func:`LazyFrame.head`. - - Parameters - ---------- - n - Number of rows to return. - - Notes - ----- - Consider using the :func:`fetch` operation if you only want to test your - query. The :func:`fetch` operation will load the first `n` rows at the scan - level, whereas the :func:`head`/:func:`limit` are applied at the end. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... ) - >>> lf.limit().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - └─────┴─────┘ - >>> lf.limit(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - └─────┴─────┘ - - ''' - def head(self, n: int = ...) -> Self: - ''' - Get the first `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Notes - ----- - Consider using the :func:`fetch` operation if you only want to test your - query. The :func:`fetch` operation will load the first `n` rows at the scan - level, whereas the :func:`head`/:func:`limit` are applied at the end. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... 
) - >>> lf.head().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - └─────┴─────┘ - >>> lf.head(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - └─────┴─────┘ - - ''' - def tail(self, n: int = ...) -> Self: - ''' - Get the last `n` rows. - - Parameters - ---------- - n - Number of rows to return. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4, 5, 6], - ... "b": [7, 8, 9, 10, 11, 12], - ... } - ... ) - >>> lf.tail().collect() - shape: (5, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - │ 6 ┆ 12 │ - └─────┴─────┘ - >>> lf.tail(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 5 ┆ 11 │ - │ 6 ┆ 12 │ - └─────┴─────┘ - - ''' - def last(self) -> Self: - ''' - Get the last row of the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.last().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 5 ┆ 6 │ - └─────┴─────┘ - - ''' - def first(self) -> Self: - ''' - Get the first row of the DataFrame. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.first().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_n_unique(self) -> Self: - ''' - Approximate count of unique values. - - This is done using the HyperLogLog++ algorithm for cardinality estimation. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.approx_n_unique().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ u32 ┆ u32 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def approx_unique(self) -> Self: - """ - Approximate count of unique values. - - .. deprecated:: 0.18.12 - This method has been renamed to :func:`LazyFrame.approx_n_unique`. - - """ - def with_row_count(self, name: str = ..., offset: int = ...) -> Self: - ''' - Add a column at index 0 that counts the rows. - - Parameters - ---------- - name - Name of the column to add. - offset - Start the row count at this offset. - - Warnings - -------- - This can have a negative effect on query performance. - This may, for instance, block predicate pushdown optimization. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 3, 5], - ... "b": [2, 4, 6], - ... } - ... ) - >>> lf.with_row_count().collect() - shape: (3, 3) - ┌────────┬─────┬─────┐ - │ row_nr ┆ a ┆ b │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ i64 ┆ i64 │ - ╞════════╪═════╪═════╡ - │ 0 ┆ 1 ┆ 2 │ - │ 1 ┆ 3 ┆ 4 │ - │ 2 ┆ 5 ┆ 6 │ - └────────┴─────┴─────┘ - - ''' - def gather_every(self, n: int) -> Self: - ''' - Take every nth row in the LazyFrame and return as a new LazyFrame. - - Parameters - ---------- - n - Gather every *n*-th row. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [5, 6, 7, 8], - ... } - ... 
) - >>> lf.gather_every(2).collect() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 5 │ - │ 3 ┆ 7 │ - └─────┴─────┘ - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - matches_supertype - Fill all matching supertypes of the fill `value` literal. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, None, 4], - ... "b": [0.5, 4, None, 13], - ... } - ... ) - >>> lf.fill_null(99).collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 99 ┆ 99.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - >>> lf.fill_null(strategy="forward").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> lf.fill_null(strategy="max").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 4 ┆ 13.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - >>> lf.fill_null(strategy="zero").collect() - shape: (4, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ f64 │ - ╞═════╪══════╡ - │ 1 ┆ 0.5 │ - │ 2 ┆ 4.0 │ - │ 0 ┆ 0.0 │ - │ 4 ┆ 13.0 │ - └─────┴──────┘ - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Self: - ''' - Fill floating point NaN values. - - Parameters - ---------- - value - Value to fill the NaN values with. - - Warnings - -------- - Note that floating point NaN (Not a Number) are not missing values! - To replace missing values, use :func:`fill_null` instead. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1.5, 2, float("nan"), 4], - ... "b": [0.5, 4, float("nan"), 13], - ... } - ... ) - >>> lf.fill_nan(99).collect() - shape: (4, 2) - ┌──────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪══════╡ - │ 1.5 ┆ 0.5 │ - │ 2.0 ┆ 4.0 │ - │ 99.0 ┆ 99.0 │ - │ 4.0 ┆ 13.0 │ - └──────┴──────┘ - - ''' - def std(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their standard deviation value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.std().collect() - shape: (1, 2) - ┌──────────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪═════╡ - │ 1.290994 ┆ 0.5 │ - └──────────┴─────┘ - >>> lf.std(ddof=0).collect() - shape: (1, 2) - ┌──────────┬──────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪══════════╡ - │ 1.118034 ┆ 0.433013 │ - └──────────┴──────────┘ - - ''' - def var(self, ddof: int = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their variance value. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. 
- - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.var().collect() - shape: (1, 2) - ┌──────────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════════╪══════╡ - │ 1.666667 ┆ 0.25 │ - └──────────┴──────┘ - >>> lf.var(ddof=0).collect() - shape: (1, 2) - ┌──────┬────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪════════╡ - │ 1.25 ┆ 0.1875 │ - └──────┴────────┘ - - ''' - def max(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their maximum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.max().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 4 ┆ 2 │ - └─────┴─────┘ - - ''' - def min(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their minimum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.min().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 1 │ - └─────┴─────┘ - - ''' - def sum(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their sum value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.sum().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 10 ┆ 5 │ - └─────┴─────┘ - - ''' - def mean(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their mean value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.mean().collect() - shape: (1, 2) - ┌─────┬──────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪══════╡ - │ 2.5 ┆ 1.25 │ - └─────┴──────┘ - - ''' - def median(self) -> Self: - ''' - Aggregate the columns in the LazyFrame to their median value. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... ) - >>> lf.median().collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 2.5 ┆ 1.0 │ - └─────┴─────┘ - - ''' - def null_count(self) -> Self: - ''' - Aggregate the columns in the LazyFrame as the sum of their null value count. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, None, 3], - ... "bar": [6, 7, None], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> lf.null_count().collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ u32 ┆ u32 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 1 ┆ 0 │ - └─────┴─────┴─────┘ - - ''' - def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: - ''' - Aggregate the columns in the LazyFrame to their quantile value. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": [1, 2, 3, 4], - ... "b": [1, 2, 1, 1], - ... } - ... 
) - >>> lf.quantile(0.7).collect() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═════╪═════╡ - │ 3.0 ┆ 1.0 │ - └─────┴─────┘ - - ''' - def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: - ''' - Explode the DataFrame to long format by exploding the given columns. - - Parameters - ---------- - columns - Column names, expressions, or a selector defining them. The underlying - columns being exploded must be of List or Utf8 datatype. - *more_columns - Additional names of columns to explode, specified as positional arguments. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "letters": ["a", "a", "b", "c"], - ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], - ... } - ... ) - >>> lf.explode("numbers").collect() - shape: (8, 2) - ┌─────────┬─────────┐ - │ letters ┆ numbers │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════════╪═════════╡ - │ a ┆ 1 │ - │ a ┆ 2 │ - │ a ┆ 3 │ - │ b ┆ 4 │ - │ b ┆ 5 │ - │ c ┆ 6 │ - │ c ┆ 7 │ - │ c ┆ 8 │ - └─────────┴─────────┘ - - ''' - def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Drop duplicate rows from this DataFrame. - - Parameters - ---------- - subset - Column name(s) or selector(s), to consider when identifying - duplicate rows. If set to `None` (default), use all columns. - keep : {\'first\', \'last\', \'any\', \'none\'} - Which of the duplicate rows to keep. - - * \'any\': Does not give any guarantee of which row is kept. - This allows more optimizations. - * \'none\': Don\'t keep duplicate rows. - * \'first\': Keep first unique row. - * \'last\': Keep last unique row. - maintain_order - Keep the same order as the original DataFrame. This is more expensive to - compute. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - Returns - ------- - LazyFrame - LazyFrame with unique rows. - - Warnings - -------- - This method will fail if there is a column of type `List` in the DataFrame or - subset. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3, 1], - ... "bar": ["a", "a", "a", "a"], - ... "ham": ["b", "b", "b", "b"], - ... } - ... ) - >>> lf.unique(maintain_order=True).collect() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - >>> lf.unique(keep="last", maintain_order=True).collect() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ a ┆ b │ - │ 3 ┆ a ┆ b │ - │ 1 ┆ a ┆ b │ - └─────┴─────┴─────┘ - - ''' - def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: - ''' - Drop all rows that contain null values. - - The original order of the remaining rows is preserved. - - Parameters - ---------- - subset - Column name(s) for which null values are considered. - If set to `None` (default), use all columns. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6, None, 8], - ... "ham": ["a", "b", None], - ... } - ... 
) - - The default behavior of this method is to drop rows where any single - value of the row is null. - - >>> lf.drop_nulls().collect() - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - - This behaviour can be constrained to consider only a subset of columns, as - defined by name or with a selector. For example, dropping rows if there is - a null in any of the integer columns: - - >>> import polars.selectors as cs - >>> lf.drop_nulls(subset=cs.integer()).collect() - shape: (2, 3) - ┌─────┬─────┬──────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪══════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ null │ - └─────┴─────┴──────┘ - - This method drops a row if any single value of the row is null. - - Below are some example snippets that show how you could drop null - values based on other conditions: - - >>> lf = pl.LazyFrame( - ... { - ... "a": [None, None, None, None], - ... "b": [1, 2, None, 1], - ... "c": [1, None, None, 1], - ... } - ... ) - >>> lf.collect() - shape: (4, 3) - ┌──────┬──────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪══════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ null ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴──────┴──────┘ - - Drop a row only if all values are null: - - >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() - shape: (3, 3) - ┌──────┬─────┬──────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ f32 ┆ i64 ┆ i64 │ - ╞══════╪═════╪══════╡ - │ null ┆ 1 ┆ 1 │ - │ null ┆ 2 ┆ null │ - │ null ┆ 1 ┆ 1 │ - └──────┴─────┴──────┘ - - ''' - def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: - ''' - Unpivot a DataFrame from wide to long format. - - Optionally leaves identifiers set. - - This function is useful to massage a DataFrame into a format where one or more - columns are identifier variables (id_vars) while all other columns, considered - measured variables (value_vars), are "unpivoted" to the row axis leaving just - two non-identifier columns, \'variable\' and \'value\'. - - Parameters - ---------- - id_vars - Column(s) or selector(s) to use as identifier variables. - value_vars - Column(s) or selector(s) to use as values variables; if `value_vars` - is empty all columns that are not in `id_vars` will be used. - variable_name - Name to give to the `variable` column. Defaults to "variable" - value_name - Name to give to the `value` column. Defaults to "value" - streamable - Allow this node to run in the streaming engine. - If this runs in streaming, the output of the melt operation - will not have a stable ordering. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "a": ["x", "y", "z"], - ... "b": [1, 3, 5], - ... "c": [2, 4, 6], - ... } - ... ) - >>> import polars.selectors as cs - >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() - shape: (6, 3) - ┌─────┬──────────┬───────┐ - │ a ┆ variable ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ i64 │ - ╞═════╪══════════╪═══════╡ - │ x ┆ b ┆ 1 │ - │ y ┆ b ┆ 3 │ - │ z ┆ b ┆ 5 │ - │ x ┆ c ┆ 2 │ - │ y ┆ c ┆ 4 │ - │ z ┆ c ┆ 6 │ - └─────┴──────────┴───────┘ - - ''' - def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: - ''' - Apply a custom function. 
- - It is important that the function returns a Polars DataFrame. - - Parameters - ---------- - function - Lambda/ function to apply. - predicate_pushdown - Allow predicate pushdown optimization to pass this node. - projection_pushdown - Allow projection pushdown optimization to pass this node. - slice_pushdown - Allow slice pushdown optimization to pass this node. - no_optimizations - Turn off all optimizations past this point. - schema - Output schema of the function, if set to `None` we assume that the schema - will remain unchanged by the applied function. - validate_output_schema - It is paramount that polars\' schema is correct. This flag will ensure that - the output schema of this function will be checked with the expected schema. - Setting this to `False` will not do this check, but may lead to hard to - debug bugs. - streamable - Whether the function that is given is eligible to be running with the - streaming engine. That means that the function must produce the same result - when it is executed in batches or when it is be executed on the full - dataset. - - Warnings - -------- - The `schema` of a `LazyFrame` must always be correct. It is up to the caller - of this function to ensure that this invariant is upheld. - - It is important that the optimization flags are correct. If the custom function - for instance does an aggregation of a column, `predicate_pushdown` should not - be allowed, as this prunes rows and will influence your aggregation results. - - Examples - -------- - >>> lf = ( # doctest: +SKIP - ... pl.LazyFrame( - ... { - ... "a": pl.int_range(-100_000, 0, eager=True), - ... "b": pl.int_range(0, 100_000, eager=True), - ... } - ... ) - ... .map_batches(lambda x: 2 * x, streamable=True) - ... .collect(streaming=True) - ... ) - shape: (100_000, 2) - ┌─────────┬────────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════════╪════════╡ - │ -200000 ┆ 0 │ - │ -199998 ┆ 2 │ - │ -199996 ┆ 4 │ - │ -199994 ┆ 6 │ - │ … ┆ … │ - │ -8 ┆ 199992 │ - │ -6 ┆ 199994 │ - │ -4 ┆ 199996 │ - │ -2 ┆ 199998 │ - └─────────┴────────┘ - - ''' - def interpolate(self) -> Self: - ''' - Interpolate intermediate values. The interpolation method is linear. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "foo": [1, None, 9, 10], - ... "bar": [6, 7, 9, None], - ... "baz": [1, None, None, 9], - ... } - ... ) - >>> lf.interpolate().collect() - shape: (4, 3) - ┌──────┬──────┬──────────┐ - │ foo ┆ bar ┆ baz │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 │ - ╞══════╪══════╪══════════╡ - │ 1.0 ┆ 6.0 ┆ 1.0 │ - │ 5.0 ┆ 7.0 ┆ 3.666667 │ - │ 9.0 ┆ 9.0 ┆ 6.333333 │ - │ 10.0 ┆ null ┆ 9.0 │ - └──────┴──────┴──────────┘ - - ''' - def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: - ''' - Decompose struct columns into separate columns for each of their fields. - - The new columns will be inserted into the DataFrame at the location of the - struct column. - - Parameters - ---------- - columns - Name of the struct column(s) that should be unnested. - *more_columns - Additional columns to unnest, specified as positional arguments. - - Examples - -------- - >>> df = pl.LazyFrame( - ... { - ... "before": ["foo", "bar"], - ... "t_a": [1, 2], - ... "t_b": ["a", "b"], - ... "t_c": [True, None], - ... "t_d": [[1, 2], [3]], - ... "after": ["baz", "womp"], - ... } - ... 
).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") - >>> df.collect() - shape: (2, 3) - ┌────────┬─────────────────────┬───────┐ - │ before ┆ t_struct ┆ after │ - │ --- ┆ --- ┆ --- │ - │ str ┆ struct[4] ┆ str │ - ╞════════╪═════════════════════╪═══════╡ - │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ - │ bar ┆ {2,"b",null,[3]} ┆ womp │ - └────────┴─────────────────────┴───────┘ - >>> df.unnest("t_struct").collect() - shape: (2, 6) - ┌────────┬─────┬─────┬──────┬───────────┬───────┐ - │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ - ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ - │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ - │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ - └────────┴─────┴─────┴──────┴───────────┴───────┘ - - ''' - def merge_sorted(self, other: LazyFrame, key: str) -> Self: - ''' - Take two sorted DataFrames and merge them by the sorted key. - - The output of this operation will also be sorted. - It is the callers responsibility that the frames are sorted - by that key otherwise the output will not make sense. - - The schemas of both LazyFrames must be equal. - - Parameters - ---------- - other - Other DataFrame that must be merged - key - Key that is sorted. - - Examples - -------- - >>> df0 = pl.LazyFrame( - ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} - ... ).sort("age") - >>> df0.collect() - shape: (3, 2) - ┌───────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═══════╪═════╡ - │ bob ┆ 18 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └───────┴─────┘ - >>> df1 = pl.LazyFrame( - ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} - ... ).sort("age") - >>> df1.collect() - shape: (4, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - └────────┴─────┘ - >>> df0.merge_sorted(df1, key="age").collect() - shape: (7, 2) - ┌────────┬─────┐ - │ name ┆ age │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════╪═════╡ - │ bob ┆ 18 │ - │ thomas ┆ 20 │ - │ anna ┆ 21 │ - │ megan ┆ 33 │ - │ steve ┆ 42 │ - │ steve ┆ 42 │ - │ elise ┆ 44 │ - └────────┴─────┘ - ''' - def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: - """ - Indicate that one or multiple columns are sorted. - - Parameters - ---------- - column - Columns that are sorted - more_columns - Additional columns that are sorted, specified as positional arguments. - descending - Whether the columns are sorted in descending order. - """ - def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., left_on: str | Sequence[str] | None = ..., right_on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ..., include_nulls: bool | None = ...) -> Self: - ''' - Update the values in this `LazyFrame` with the non-null values in `other`. - - Parameters - ---------- - other - LazyFrame that will be used to update the values - on - Column names that will be joined on; if given `None` the implicit row - index is used as a join key instead. - left_on - Join column(s) of the left DataFrame. - right_on - Join column(s) of the right DataFrame. - how : {\'left\', \'inner\', \'outer\'} - * \'left\' will keep all rows from the left table; rows may be duplicated - if multiple rows in the right frame match the left row\'s key. - * \'inner\' keeps only those rows where the key exists in both frames. 
- * \'outer\' will update existing rows where the key matches while also - adding any new rows contained in the given frame. - include_nulls - If True, null values from the right DataFrame will be used to update the - left DataFrame. - - Notes - ----- - This is syntactic sugar for a left/inner join, with an optional coalesce when - `include_nulls = False`. - - Examples - -------- - >>> lf = pl.LazyFrame( - ... { - ... "A": [1, 2, 3, 4], - ... "B": [400, 500, 600, 700], - ... } - ... ) - >>> lf.collect() - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 400 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - >>> new_lf = pl.LazyFrame( - ... { - ... "B": [-66, None, -99], - ... "C": [5, 3, 1], - ... } - ... ) - - Update `df` values with the non-null values in `new_df`, by row index: - - >>> lf.update(new_lf).collect() - shape: (4, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - │ 4 ┆ 700 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, by row index, - but only keeping those rows that are common to both frames: - - >>> lf.update(new_lf, how="inner").collect() - shape: (3, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -66 │ - │ 2 ┆ 500 │ - │ 3 ┆ -99 │ - └─────┴─────┘ - - Update `df` values with the non-null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() - shape: (5, 2) - ┌─────┬─────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ 600 │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴─────┘ - - Update `df` values including null values in `new_df`, using an outer join - strategy that defines explicit join columns in each frame: - - >>> lf.update( - ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True - ... ).collect() - shape: (5, 2) - ┌─────┬──────┐ - │ A ┆ B │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪══════╡ - │ 1 ┆ -99 │ - │ 2 ┆ 500 │ - │ 3 ┆ null │ - │ 4 ┆ 700 │ - │ 5 ┆ -66 │ - └─────┴──────┘ - - ''' - def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: - """ - Start a group by operation. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.group_by`. - - Parameters - ---------- - by - Column(s) to group by. Accepts expression input. Strings are parsed as - column names. - *more_by - Additional columns to group by, specified as positional arguments. - maintain_order - Ensure that the order of the groups is consistent with the input data. - This is slower than a default group by. - Settings this to `True` blocks the possibility - to run on the streaming engine. - - """ - def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. 
- period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - """ - def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: - """ - Create rolling groups based on a time, Int32, or Int64 column. - - .. deprecated:: 0.19.9 - This method has been renamed to :func:`LazyFrame.rolling`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a rolling group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - period - length of the window - must be non-negative - offset - offset of the window. Default is -period - closed : {'right', 'left', 'both', 'none'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - """ - def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: - ''' - Group based on a time value (or index value of type Int32, Int64). - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.group_by_dynamic`. - - Parameters - ---------- - index_column - Column used to group based on the time window. - Often of type Date/Datetime. - This column must be sorted in ascending order (or, if `by` is specified, - then it must be sorted in ascending order within each group). - - In case of a dynamic group by on indices, dtype needs to be one of - {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if - performance matters use an Int64 column. - every - interval of the window - period - length of the window, if None it will equal \'every\' - offset - offset of the window, only takes effect if `start_by` is `\'window\'`. - Defaults to negative `every`. - truncate - truncate the time value to the window lower bound - include_boundaries - Add the lower and upper bound of the window to the "_lower_bound" and - "_upper_bound" columns. 
This will impact performance because it\'s harder to - parallelize - closed : {\'right\', \'left\', \'both\', \'none\'} - Define which sides of the temporal interval are closed (inclusive). - by - Also group by this column/these columns - start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} - The strategy to determine the start of the first window by. - - * \'window\': Start by taking the earliest timestamp, truncating it with - `every`, and then adding `offset`. - Note that weekly windows start on Monday. - * \'datapoint\': Start from the first encountered data point. - * a day of the week (only takes effect if `every` contains `\'w\'`): - - * \'monday\': Start the window on the Monday before the first data point. - * \'tuesday\': Start the window on the Tuesday before the first data point. - * ... - * \'sunday\': Start the window on the Sunday before the first data point. - check_sorted - When the `by` argument is given, polars can not check sortedness - by the metadata and has to do a full scan on the index column to - verify data is sorted. This is expensive. If you are sure the - data within the by groups is sorted, you can set this to `False`. - Doing so incorrectly will lead to incorrect output - - Returns - ------- - LazyGroupBy - Object you can call `.agg` on to aggregate by groups, the result - of which will be sorted by `index_column` (but note that if `by` columns are - passed, it will only be sorted within each `by` group). - - ''' - def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: - """ - Apply a custom function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`LazyFrame.map_batches`. - - Parameters - ---------- - function - Lambda/ function to apply. - predicate_pushdown - Allow predicate pushdown optimization to pass this node. - projection_pushdown - Allow projection pushdown optimization to pass this node. - slice_pushdown - Allow slice pushdown optimization to pass this node. - no_optimizations - Turn off all optimizations past this point. - schema - Output schema of the function, if set to `None` we assume that the schema - will remain unchanged by the applied function. - validate_output_schema - It is paramount that polars' schema is correct. This flag will ensure that - the output schema of this function will be checked with the expected schema. - Setting this to `False` will not do this check, but may lead to hard to - debug bugs. - streamable - Whether the function that is given is eligible to be running with the - streaming engine. That means that the function must produce the same result - when it is executed in batches or when it is be executed on the full - dataset. - - """ - def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def take_every(self, n: int) -> Self: - """ - Take every nth row in the LazyFrame and return as a new LazyFrame. - - .. deprecated:: 0.19.0 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - @property - def columns(self): ... - @property - def dtypes(self): ... - @property - def schema(self): ... - @property - def width(self): ... 
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..ddd60e0 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/lazyframe/frame.pyi @@ -0,0 +1,4209 @@ +#: version 0.20.3 +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, String as String, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.lazyframe.in_process import InProcessQuery as InProcessQuery +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_count_args as _prepare_row_count_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use `pl.scan_csv` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use `pl.scan_parquet` to dispatch to this method. 
+ + See Also + -------- + polars.io.scan_parquet + + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use `pl.scan_ipc` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use `pl.scan_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. 
+ + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to `True`. + If this is set to `True` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. 
+ streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and passes + on the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the DataFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If \'descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. 
+ Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If \'descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the \'k\' smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... 
"c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + + ''' + def collect(self) -> DataFrame | InProcessQuery: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to `False`. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + This functionality is currently in an alpha state. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + background + Run the query in the background and get a handle to the query. + This handle can be used to fetch the result or cancel the query. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + DataFrame directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. 
+ + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. 
+ simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. 
+ maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + + ''' + def sink_ndjson(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an NDJSON file at the provided path. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ndjson("out.ndjson") # doctest: +SKIP + + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... 
) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.String).collect().to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.String}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. 
+ + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters; use `name = value` to filter columns by the supplied value. + Each constraint will behave the same as `pl.col(name).eq(value)`, and + will be implicitly joined with the other filter conditions using `&`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... ).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. 
+ + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to `True` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. 
+ + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + ''' + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `dynamic_group_by` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). 
+ + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... ) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. 
+ If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. 
+ + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\', \'outer_coalesce\'} + Join strategy. + + * *inner* + Returns rows that have matching values in both tables + * *left* + Returns all rows from the left table, and the matched rows from the + right table + * *outer* + Returns all rows when there is a match in either left or right table + * *outer_coalesce* + Same as \'outer\', but coalesces the key columns + * *cross* + Returns the cartisian product of rows from both tables + * *semi* + Filter rows that have a match in the right table. + * *anti* + Filter rows that not have a match in the right table. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. 
+ + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + join_nulls + Join on null values. By default null values will never produce matches. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 5) + ┌──────┬──────┬──────┬───────┬───────────┐ + │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞══════╪══════╪══════╪═══════╪═══════════╡ + │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │ + │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │ + │ null ┆ null ┆ null ┆ z ┆ d │ + │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │ + └──────┴──────┴──────┴───────┴───────────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. 
+ Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another DataFrame: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + + ''' + def drop(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Remove columns from the DataFrame. + + Parameters + ---------- + columns + Name of the column(s) that should be removed from the DataFrame. + *more_columns + Additional columns to drop, specified as positional arguments. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), + polars will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... 
) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... 
) + >>> lf.with_row_count().collect() + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + + ''' + def gather_every(self, n: int, offset: int = ...) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.gather_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + >>> lf.gather_every(2, offset=1).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill `value` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the DataFrame to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or String datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this DataFrame. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. 
+ + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The `schema` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, `predicate_pushdown` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... 
) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the DataFrame at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... 
).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + .. warning:: + This functionality is experimental and may change without it being + considered a breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on; if given `None` the implicit row + index is used as a join key instead. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + include_nulls + If True, null values from the right DataFrame will be used to update the + left DataFrame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... 
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + + ''' + def count(self) -> Self: + ''' + Return the number of non-null elements for each column. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... {"a": [1, 2, 3, 4], "b": [1, 2, 1, None], "c": [None, None, None, None]} + ... ) + >>> lf.count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 3 ┆ 0 │ + └─────┴─────┴─────┘ + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars cannot check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output. + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars cannot check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output. + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize. + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars cannot check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output. + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is executed on the full + dataset. + + """ + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def take_every(self, n: int, offset: int = ...) -> Self: + """ + Take every nth row in the LazyFrame and return as a new LazyFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ...
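Several parameters documented in the LazyFrame stub above, such as `validate` on `join` and the sequential `with_columns_seq`, are described but not demonstrated in the generated docstrings. As a rough, hypothetical sketch only (the frames and column names below are invented, and exact output depends on the polars version the stub was generated for), usage could look like:

import polars as pl

# Two small lazy frames; set_sorted marks "key" as pre-sorted so later operations can rely on it.
left = pl.LazyFrame({"key": [1, 2, 3], "val": [10, 20, 30]}).set_sorted("key")
right = pl.LazyFrame({"key": [1, 2, 3], "tag": ["a", "b", "c"]})

out = (
    left
    # validate="1:1" asks polars to verify that the join keys are unique on both sides.
    .join(right, on="key", validate="1:1")
    # with_columns_seq evaluates the expressions one after another instead of in parallel.
    .with_columns_seq((pl.col("val") * 2).alias("val_doubled"))
    .collect()
)
print(out)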
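Likewise, the deprecated `groupby_dynamic` entry documents the windowing parameters (`every`, `period`, `closed`, `start_by`) that its renamed replacement `group_by_dynamic` accepts. A minimal assumed example of the renamed method (the data is made up and not part of the generated stub):

import polars as pl
from datetime import datetime

lf = pl.LazyFrame(
    {
        "ts": [datetime(2024, 1, 1, h) for h in range(6)],  # hourly timestamps, already ascending
        "value": [1, 2, 3, 4, 5, 6],
    }
).set_sorted("ts")

# every="2h" starts a new window every two hours; closed="left" includes the left edge of each window.
windows = lf.group_by_dynamic("ts", every="2h", closed="left").agg(pl.col("value").sum())
print(windows.collect())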
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/series/series b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/series/series deleted file mode 100644 index 4a40006..0000000 --- a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/series/series +++ /dev/null @@ -1,4988 +0,0 @@ -import np as np -import pa as pa -import pd as pd -from builtins import PySeries -from datetime import date, datetime, timedelta -from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Object as Object, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown, Utf8 as Utf8 -from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code -from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat -from polars.exceptions import ShapeError as ShapeError -from polars.series.array import ArrayNameSpace as ArrayNameSpace -from polars.series.binary import BinaryNameSpace as BinaryNameSpace -from polars.series.categorical import CatNameSpace as CatNameSpace -from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace -from polars.series.list import ListNameSpace as ListNameSpace -from polars.series.string import StringNameSpace as StringNameSpace -from polars.series.struct import StructNameSpace as StructNameSpace -from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func -from polars.slice import PolarsSlice as PolarsSlice -from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries -from polars.utils._wrap import wrap_df as wrap_df -from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta -from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning -from polars.utils.meta import get_index_type as get_index_type -from polars.utils.various import _is_generator as _is_generator, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor -from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, NoReturn, Sequence - -TYPE_CHECKING: bool -_PYARROW_AVAILABLE: bool - -class Series: - _s: _ClassVar[None] = ... - _accessors: _ClassVar[set] = ... 
- def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... - @classmethod - def _from_pyseries(cls, pyseries: PySeries) -> Self: ... - @classmethod - def _from_arrow(cls, name: str, values: pa.Array) -> Self: - """Construct a Series from an Arrow Array.""" - @classmethod - def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: - """Construct a Series from a pandas Series or DatetimeIndex.""" - def _get_ptr(self) -> tuple[int, int, int]: - """ - Get a pointer to the start of the values buffer of a numeric Series. - - This will raise an error if the `Series` contains multiple chunks. - - This will return the offset, length and the pointer itself. - - """ - def __bool__(self) -> NoReturn: ... - def __len__(self) -> int: ... - def __and__(self, other: Series) -> Self: ... - def __rand__(self, other: Series) -> Series: ... - def __or__(self, other: Series) -> Self: ... - def __ror__(self, other: Series) -> Series: ... - def __xor__(self, other: Series) -> Self: ... - def __rxor__(self, other: Series) -> Series: ... - def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... - def __eq__(self, other: Any) -> Series | Expr: ... - def __ne__(self, other: Any) -> Series | Expr: ... - def __gt__(self, other: Any) -> Series | Expr: ... - def __lt__(self, other: Any) -> Series | Expr: ... - def __ge__(self, other: Any) -> Series | Expr: ... - def __le__(self, other: Any) -> Series | Expr: ... - def le(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series <= other`.""" - def lt(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series < other`.""" - def eq(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series == other`.""" - def eq_missing(self, other: Any) -> Self | Expr: - ''' - Method equivalent of equality operator `series == other` where `None == None`. - - This differs from the standard `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - See Also - -------- - ne_missing - eq - - Examples - -------- - >>> s1 = pl.Series("a", [333, 200, None]) - >>> s2 = pl.Series("a", [100, 200, None]) - >>> s1.eq(s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - null - ] - >>> s1.eq_missing(s2) - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - ''' - def ne(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series != other`.""" - def ne_missing(self, other: Any) -> Self | Expr: - ''' - Method equivalent of equality operator `series != other` where `None == None`. - - This differs from the standard `ne` where null values are propagated. - - Parameters - ---------- - other - A literal or expression value to compare with. - - See Also - -------- - eq_missing - ne - - Examples - -------- - >>> s1 = pl.Series("a", [333, 200, None]) - >>> s2 = pl.Series("a", [100, 200, None]) - >>> s1.ne(s2) - shape: (3,) - Series: \'a\' [bool] - [ - true - false - null - ] - >>> s1.ne_missing(s2) - shape: (3,) - Series: \'a\' [bool] - [ - true - false - false - ] - - ''' - def ge(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series >= other`.""" - def gt(self, other: Any) -> Self | Expr: - """Method equivalent of operator expression `series > other`.""" - def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... 
- def __add__(self, other: Any) -> Self | DataFrame | Expr: ... - def __sub__(self, other: Any) -> Self | Expr: ... - def __truediv__(self, other: Any) -> Series | Expr: ... - def __floordiv__(self, other: Any) -> Series | Expr: ... - def __invert__(self) -> Series: ... - def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... - def __mod__(self, other: Any) -> Series | Expr: ... - def __rmod__(self, other: Any) -> Series: ... - def __radd__(self, other: Any) -> Series: ... - def __rsub__(self, other: Any) -> Series: ... - def __rtruediv__(self, other: Any) -> Series: ... - def __rfloordiv__(self, other: Any) -> Series: ... - def __rmul__(self, other: Any) -> Series: ... - def __pow__(self, exponent: int | float | None | Series) -> Series: ... - def __rpow__(self, other: Any) -> Series: ... - def __matmul__(self, other: Any) -> float | Series | None: ... - def __rmatmul__(self, other: Any) -> float | Series | None: ... - def __neg__(self) -> Series: ... - def __pos__(self) -> Series: ... - def __abs__(self) -> Series: ... - def __copy__(self) -> Self: ... - def __deepcopy__(self, memo: None = ...) -> Self: ... - def __contains__(self, item: Any) -> bool: ... - def __iter__(self) -> Generator[Any, None, None]: ... - def _pos_idxs(self, size: int) -> Series: ... - def _take_with_series(self, s: Series) -> Series: ... - def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... - def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... - def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: - """ - Numpy __array__ interface protocol. - - Ensures that `np.asarray(pl.Series(..))` works as expected, see - https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. - """ - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: - """Numpy universal functions.""" - def __column_consortium_standard__(self) -> Any: - """ - Provide entry point to the Consortium DataFrame Standard API. - - This is developed and maintained outside of polars. - Please report any issues to https://github.com/data-apis/dataframe-api-compat. - """ - def _repr_html_(self) -> str: - """Format output data in HTML for display in Jupyter Notebooks.""" - def item(self, index: int | None = ...) -> Any: - ''' - Return the Series as a scalar, or return the element at the given index. - - If no index is provided, this is equivalent to `s[0]`, with a check - that the shape is (1,). With an index, this is equivalent to `s[index]`. - - Examples - -------- - >>> s1 = pl.Series("a", [1]) - >>> s1.item() - 1 - >>> s2 = pl.Series("a", [9, 8, 7]) - >>> s2.cum_sum().item(-1) - 24 - - ''' - def estimated_size(self, unit: SizeUnit = ...) -> int | float: - ''' - Return an estimation of the total (heap) allocated size of the Series. - - Estimated size is given in the specified unit (bytes by default). - - This estimation is the sum of the size of its buffers, validity, including - nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the - size of 2 arrays is not the sum of the sizes computed from this function. In - particular, [`StructArray`]\'s size is an upper bound. - - When an array is sliced, its allocated size remains constant because the buffer - unchanged. However, this function will yield a smaller number. This is because - this function returns the visible size of the buffer, not its total capacity. 
- - FFI buffers are included in this estimation. - - Parameters - ---------- - unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} - Scale the returned size to the given unit. - - Examples - -------- - >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) - >>> s.estimated_size() - 4000000 - >>> s.estimated_size("mb") - 3.814697265625 - - ''' - def sqrt(self) -> Series: - """ - Compute the square root of the elements. - - Syntactic sugar for - - >>> pl.Series([1, 2]) ** 0.5 - shape: (2,) - Series: '' [f64] - [ - 1.0 - 1.414214 - ] - - """ - def cbrt(self) -> Series: - """ - Compute the cube root of the elements. - - Optimization for - - >>> pl.Series([1, 2]) ** (1.0 / 3) - shape: (2,) - Series: '' [f64] - [ - 1.0 - 1.259921 - ] - - """ - def any(self) -> bool | None: - """ - Return whether any of the values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is `None`. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - bool or None - - Examples - -------- - >>> pl.Series([True, False]).any() - True - >>> pl.Series([False, False]).any() - False - >>> pl.Series([None, False]).any() - False - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None - - """ - def all(self) -> bool | None: - """ - Return whether all values in the column are `True`. - - Only works on columns of data type :class:`Boolean`. - - Parameters - ---------- - ignore_nulls - Ignore null values (default). - - If set to `False`, `Kleene logic`_ is used to deal with nulls: - if the column contains any null values and no `True` values, - the output is `None`. - - .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic - - Returns - ------- - bool or None - - Examples - -------- - >>> pl.Series([True, True]).all() - True - >>> pl.Series([False, True]).all() - False - >>> pl.Series([None, True]).all() - True - - Enable Kleene logic by setting `ignore_nulls=False`. - - >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None - - """ - def log(self, base: float = ...) -> Series: - """Compute the logarithm to a given base.""" - def log1p(self) -> Series: - """Compute the natural logarithm of the input array plus one, element-wise.""" - def log10(self) -> Series: - """Compute the base 10 logarithm of the input array, element-wise.""" - def exp(self) -> Series: - """Compute the exponential, element-wise.""" - def drop_nulls(self) -> Series: - ''' - Drop all null values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nans - - Notes - ----- - A null value is not the same as a NaN value. - To drop NaN values, use :func:`drop_nans`. - - Examples - -------- - >>> s = pl.Series([1.0, None, 3.0, float("nan")]) - >>> s.drop_nulls() - shape: (3,) - Series: \'\' [f64] - [ - 1.0 - 3.0 - NaN - ] - - ''' - def drop_nans(self) -> Series: - ''' - Drop all floating point NaN values. - - The original order of the remaining elements is preserved. - - See Also - -------- - drop_nulls - - Notes - ----- - A NaN value is not the same as a null value. - To drop null values, use :func:`drop_nulls`. 
- - Examples - -------- - >>> s = pl.Series([1.0, None, 3.0, float("nan")]) - >>> s.drop_nans() - shape: (3,) - Series: \'\' [f64] - [ - 1.0 - null - 3.0 - ] - - ''' - def to_frame(self, name: str | None = ...) -> DataFrame: - ''' - Cast this Series to a DataFrame. - - Parameters - ---------- - name - optionally name/rename the Series column in the new DataFrame. - - Examples - -------- - >>> s = pl.Series("a", [123, 456]) - >>> df = s.to_frame() - >>> df - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 123 │ - │ 456 │ - └─────┘ - - >>> df = s.to_frame("xyz") - >>> df - shape: (2, 1) - ┌─────┐ - │ xyz │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 123 │ - │ 456 │ - └─────┘ - - ''' - def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: - ''' - Quick summary statistics of a Series. - - Series with mixed datatypes will return summary statistics for the datatype of - the first value. - - Parameters - ---------- - percentiles - One or more percentiles to include in the summary statistics (if the - Series has a numeric dtype). All values must be in the range `[0, 1]`. - - Notes - ----- - The median is included by default as the 50% percentile. - - Returns - ------- - DataFrame - Mapping with summary statistics of a Series. - - Examples - -------- - >>> series_num = pl.Series([1, 2, 3, 4, 5]) - >>> series_num.describe() - shape: (9, 2) - ┌────────────┬──────────┐ - │ statistic ┆ value │ - │ --- ┆ --- │ - │ str ┆ f64 │ - ╞════════════╪══════════╡ - │ count ┆ 5.0 │ - │ null_count ┆ 0.0 │ - │ mean ┆ 3.0 │ - │ std ┆ 1.581139 │ - │ min ┆ 1.0 │ - │ 25% ┆ 2.0 │ - │ 50% ┆ 3.0 │ - │ 75% ┆ 4.0 │ - │ max ┆ 5.0 │ - └────────────┴──────────┘ - - >>> series_str = pl.Series(["a", "a", None, "b", "c"]) - >>> series_str.describe() - shape: (3, 2) - ┌────────────┬───────┐ - │ statistic ┆ value │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞════════════╪═══════╡ - │ count ┆ 5 │ - │ null_count ┆ 1 │ - │ unique ┆ 4 │ - └────────────┴───────┘ - - ''' - def sum(self) -> int | float: - ''' - Reduce this Series to the sum value. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.sum() - 6 - - ''' - def mean(self) -> int | float | None: - ''' - Reduce this Series to the mean value. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.mean() - 2.0 - - ''' - def product(self) -> int | float: - """Reduce this Series to the product value.""" - def pow(self, exponent: int | float | None | Series) -> Series: - ''' - Raise to the power of the given exponent. - - Parameters - ---------- - exponent - The exponent. Accepts Series input. - - Examples - -------- - >>> s = pl.Series("foo", [1, 2, 3, 4]) - >>> s.pow(3) - shape: (4,) - Series: \'foo\' [f64] - [ - 1.0 - 8.0 - 27.0 - 64.0 - ] - - ''' - def min(self) -> PythonLiteral | None: - ''' - Get the minimal value in this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.min() - 1 - - ''' - def max(self) -> PythonLiteral | None: - ''' - Get the maximum value in this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.max() - 3 - - ''' - def nan_max(self) -> int | float | date | datetime | timedelta | str: - """ - Get maximum value, but propagate/poison encountered NaN values. - - This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. 
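Neither `nan_max` nor `nan_min` (defined just below) carries a doctest, so here is a small sketch of the difference described here, assuming the documented semantics: the plain aggregations ignore NaN, the `nan_*` variants poison on it.

    import polars as pl

    s = pl.Series("a", [1.0, 2.0, float("nan")])
    s.max()       # 2.0 -- NaN ignored
    s.nan_max()   # nan -- NaN propagated
    s.min()       # 1.0
    s.nan_min()   # nan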
- - """ - def nan_min(self) -> int | float | date | datetime | timedelta | str: - """ - Get minimum value, but propagate/poison encountered NaN values. - - This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, - whereas polars defaults to ignoring them. - - """ - def std(self, ddof: int = ...) -> float | None: - ''' - Get the standard deviation of this Series. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.std() - 1.0 - - ''' - def var(self, ddof: int = ...) -> float | None: - ''' - Get variance of this Series. - - Parameters - ---------- - ddof - “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, - where N represents the number of elements. - By default ddof is 1. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.var() - 1.0 - - ''' - def median(self) -> float | None: - ''' - Get the median of this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.median() - 2.0 - - ''' - def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: - ''' - Get the quantile value of this Series. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.quantile(0.5) - 2.0 - - ''' - def to_dummies(self, separator: str = ...) -> DataFrame: - ''' - Get dummy/indicator variables. - - Parameters - ---------- - separator - Separator/delimiter used when generating column names. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.to_dummies() - shape: (3, 3) - ┌─────┬─────┬─────┐ - │ a_1 ┆ a_2 ┆ a_3 │ - │ --- ┆ --- ┆ --- │ - │ u8 ┆ u8 ┆ u8 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 0 ┆ 0 │ - │ 0 ┆ 1 ┆ 0 │ - │ 0 ┆ 0 ┆ 1 │ - └─────┴─────┴─────┘ - - ''' - def cut(self, breaks: Sequence[float]) -> Series | DataFrame: - ''' - Bin continuous values into discrete categories. - - Parameters - ---------- - breaks - List of unique cut points. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - break_point_label - Name of the breakpoint column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - category_label - Name of the category column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - left_closed - Set the intervals to be left-closed instead of right-closed. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - as_series - If set to `False`, return a DataFrame containing the original values, - the breakpoints, and the categories. - - .. deprecated:: 0.19.0 - This parameter will be removed. The same behavior can be achieved by - setting `include_breaks=True`, unnesting the resulting struct Series, - and adding the result to the original Series. 
- - Returns - ------- - Series - Series of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise a Series of data type :class:`Struct`. - - See Also - -------- - qcut - - Examples - -------- - Divide the column into three categories. - - >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) - >>> s.cut([-1, 1], labels=["a", "b", "c"]) - shape: (5,) - Series: \'foo\' [cat] - [ - "a" - "a" - "b" - "b" - "c" - ] - - Create a DataFrame with the breakpoint and category for each value. - - >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") - >>> s.to_frame().with_columns(cut).unnest("cut") - shape: (5, 3) - ┌─────┬─────────────┬────────────┐ - │ foo ┆ break_point ┆ category │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪═════════════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴─────────────┴────────────┘ - - ''' - def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: - ''' - Bin continuous values into discrete categories based on their quantiles. - - Parameters - ---------- - quantiles - Either a list of quantile probabilities between 0 and 1 or a positive - integer determining the number of bins with uniform probability. - labels - Names of the categories. The number of labels must be equal to the number - of cut points plus one. - left_closed - Set the intervals to be left-closed instead of right-closed. - allow_duplicates - If set to `True`, duplicates in the resulting quantiles are dropped, - rather than raising a `DuplicateError`. This can happen even with unique - probabilities, depending on the data. - include_breaks - Include a column with the right endpoint of the bin each observation falls - in. This will change the data type of the output from a - :class:`Categorical` to a :class:`Struct`. - break_point_label - Name of the breakpoint column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - category_label - Name of the category column. Only used if `include_breaks` is set to - `True`. - - .. deprecated:: 0.19.0 - This parameter will be removed. Use `Series.struct.rename_fields` to - rename the field instead. - as_series - If set to `False`, return a DataFrame containing the original values, - the breakpoints, and the categories. - - .. deprecated:: 0.19.0 - This parameter will be removed. The same behavior can be achieved by - setting `include_breaks=True`, unnesting the resulting struct Series, - and adding the result to the original Series. - - Returns - ------- - Series - Series of data type :class:`Categorical` if `include_breaks` is set to - `False` (default), otherwise a Series of data type :class:`Struct`. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - See Also - -------- - cut - - Examples - -------- - Divide a column into three categories according to pre-defined quantile - probabilities. - - >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) - >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) - shape: (5,) - Series: \'foo\' [cat] - [ - "a" - "a" - "b" - "b" - "c" - ] - - Divide a column into two categories using uniform quantile probabilities. 
- - >>> s.qcut(2, labels=["low", "high"], left_closed=True) - shape: (5,) - Series: \'foo\' [cat] - [ - "low" - "low" - "high" - "high" - "high" - ] - - Create a DataFrame with the breakpoint and category for each value. - - >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") - >>> s.to_frame().with_columns(cut).unnest("cut") - shape: (5, 3) - ┌─────┬─────────────┬────────────┐ - │ foo ┆ break_point ┆ category │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ cat │ - ╞═════╪═════════════╪════════════╡ - │ -2 ┆ -1.0 ┆ (-inf, -1] │ - │ -1 ┆ -1.0 ┆ (-inf, -1] │ - │ 0 ┆ 1.0 ┆ (-1, 1] │ - │ 1 ┆ 1.0 ┆ (-1, 1] │ - │ 2 ┆ inf ┆ (1, inf] │ - └─────┴─────────────┴────────────┘ - - ''' - def rle(self) -> Series: - ''' - Get the lengths of runs of identical values. - - Returns - ------- - Series - Series of data type :class:`Struct` with Fields "lengths" and "values". - - Examples - -------- - >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) - >>> s.rle().struct.unnest() - shape: (6, 2) - ┌─────────┬────────┐ - │ lengths ┆ values │ - │ --- ┆ --- │ - │ i32 ┆ i64 │ - ╞═════════╪════════╡ - │ 2 ┆ 1 │ - │ 1 ┆ 2 │ - │ 1 ┆ 1 │ - │ 1 ┆ null │ - │ 1 ┆ 1 │ - │ 2 ┆ 3 │ - └─────────┴────────┘ - ''' - def rle_id(self) -> Series: - ''' - Map values to run IDs. - - Similar to RLE, but it maps each value to an ID corresponding to the run into - which it falls. This is especially useful when you want to define groups by - runs of identical values rather than the values themselves. - - Returns - ------- - Series - - See Also - -------- - rle - - Examples - -------- - >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) - >>> s.rle_id() - shape: (8,) - Series: \'s\' [u32] - [ - 0 - 0 - 1 - 2 - 3 - 4 - 5 - 5 - ] - ''' - def hist(self, bins: list[float] | None = ...) -> DataFrame: - ''' - Bin values into buckets and count their occurrences. - - Parameters - ---------- - bins - Discretizations to make. - If None given, we determine the boundaries based on the data. - bin_count - If no bins provided, this will be used to determine - the distance of the bins - - Returns - ------- - DataFrame - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - Examples - -------- - >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) - >>> a.hist(bin_count=4) - shape: (5, 3) - ┌─────────────┬─────────────┬─────────┐ - │ break_point ┆ category ┆ a_count │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ cat ┆ u32 │ - ╞═════════════╪═════════════╪═════════╡ - │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ - │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ - │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ - │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ - │ inf ┆ (6.75, inf] ┆ 2 │ - └─────────────┴─────────────┴─────────┘ - - ''' - def value_counts(self) -> DataFrame: - ''' - Count the occurrences of unique values. - - Parameters - ---------- - sort - Sort the output by count in descending order. - If set to `False` (default), the order of the output is random. - parallel - Execute the computation in parallel. - - .. note:: - This option should likely not be enabled in a group by context, - as the computation is already parallelized per group. - - Returns - ------- - DataFrame - Mapping of unique values to their count. - - Examples - -------- - >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) - >>> s.value_counts() # doctest: +IGNORE_RESULT - shape: (3, 2) - ┌───────┬────────┐ - │ color ┆ counts │ - │ --- ┆ --- │ - │ str ┆ u32 │ - ╞═══════╪════════╡ - │ red ┆ 2 │ - │ green ┆ 1 │ - │ blue ┆ 3 │ - └───────┴────────┘ - - Sort the output by count. 
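The sorted output shown next is presumably produced with the `sort` flag documented in the Parameters section above; a minimal sketch:

    import polars as pl

    s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"])
    s.value_counts(sort=True)   # rows ordered by descending count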
- - shape: (3, 2) - ┌───────┬────────┐ - │ color ┆ counts │ - │ --- ┆ --- │ - │ str ┆ u32 │ - ╞═══════╪════════╡ - │ blue ┆ 3 │ - │ red ┆ 2 │ - │ green ┆ 1 │ - └───────┴────────┘ - - ''' - def unique_counts(self) -> Series: - ''' - Return a count of the unique values in the order of appearance. - - Examples - -------- - >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) - >>> s.unique_counts() - shape: (3,) - Series: \'id\' [u32] - [ - 1 - 2 - 3 - ] - - ''' - def entropy(self, base: float = ...) -> float | None: - """ - Computes the entropy. - - Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. - - Parameters - ---------- - base - Given base, defaults to `e` - normalize - Normalize pk if it doesn't sum to 1. - - Examples - -------- - >>> a = pl.Series([0.99, 0.005, 0.005]) - >>> a.entropy(normalize=True) - 0.06293300616044681 - >>> b = pl.Series([0.65, 0.10, 0.25]) - >>> b.entropy(normalize=True) - 0.8568409950394724 - - """ - def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: - ''' - Run an expression over a sliding window that increases `1` slot every iteration. - - Parameters - ---------- - expr - Expression to evaluate - min_periods - Number of valid values there should be in the window before the expression - is evaluated. valid values = `length - null_count` - parallel - Run in parallel. Don\'t do this in a group by or another operation that - already has much parallelization. - - Warnings - -------- - This functionality is experimental and may change without it being considered a - breaking change. - - This can be really slow as it can have `O(n^2)` complexity. Don\'t use this - for operations that visit all elements. - - Examples - -------- - >>> s = pl.Series("values", [1, 2, 3, 4, 5]) - >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) - shape: (5,) - Series: \'values\' [f64] - [ - 0.0 - -3.0 - -8.0 - -15.0 - -24.0 - ] - - ''' - def alias(self, name: str) -> Series: - ''' - Rename the series. - - Parameters - ---------- - name - The new name. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.alias("b") - shape: (3,) - Series: \'b\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def rename(self, name: str) -> Series: - ''' - Rename this Series. - - Alias for :func:`Series.alias`. - - Parameters - ---------- - name - New name. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.rename("b") - shape: (3,) - Series: \'b\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def chunk_lengths(self) -> list[int]: - ''' - Get the length of each individual chunk. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("a", [4, 5, 6]) - - Concatenate Series with rechunk = True - - >>> pl.concat([s, s2]).chunk_lengths() - [6] - - Concatenate Series with rechunk = False - - >>> pl.concat([s, s2], rechunk=False).chunk_lengths() - [3, 3] - - ''' - def n_chunks(self) -> int: - ''' - Get the number of chunks that this Series contains. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.n_chunks() - 1 - >>> s2 = pl.Series("a", [4, 5, 6]) - - Concatenate Series with rechunk = True - - >>> pl.concat([s, s2]).n_chunks() - 1 - - Concatenate Series with rechunk = False - - >>> pl.concat([s, s2], rechunk=False).n_chunks() - 2 - - ''' - def cum_max(self) -> Series: - ''' - Get an array with the cumulative max computed at every element. - - Parameters - ---------- - reverse - reverse the operation. 
- - Examples - -------- - >>> s = pl.Series("s", [3, 5, 1]) - >>> s.cum_max() - shape: (3,) - Series: \'s\' [i64] - [ - 3 - 5 - 5 - ] - - ''' - def cum_min(self) -> Series: - ''' - Get an array with the cumulative min computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Examples - -------- - >>> s = pl.Series("s", [1, 2, 3]) - >>> s.cum_min() - shape: (3,) - Series: \'s\' [i64] - [ - 1 - 1 - 1 - ] - - ''' - def cum_prod(self) -> Series: - ''' - Get an array with the cumulative product computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.cum_prod() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 6 - ] - - ''' - def cum_sum(self) -> Series: - ''' - Get an array with the cumulative sum computed at every element. - - Parameters - ---------- - reverse - reverse the operation. - - Notes - ----- - Dtypes in {Int8, UInt8, Int16, UInt16} are cast to - Int64 before summing to prevent overflow issues. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.cum_sum() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 3 - 6 - ] - - ''' - def slice(self, offset: int, length: int | None = ...) -> Series: - ''' - Get a slice of this Series. - - Parameters - ---------- - offset - Start index. Negative indexing is supported. - length - Length of the slice. If set to `None`, all rows starting at the offset - will be selected. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.slice(1, 2) - shape: (2,) - Series: \'a\' [i64] - [ - 2 - 3 - ] - - ''' - def append(self, other: Series) -> Self: - ''' - Append a Series to this one. - - Parameters - ---------- - other - Series to append. - append_chunks - .. deprecated:: 0.18.8 - This argument will be removed and `append` will change to always - behave like `append_chunks=True` (the previous default). For the - behavior of `append_chunks=False`, use `Series.extend`. - - If set to `True` the append operation will add the chunks from `other` to - self. This is super cheap. - - If set to `False` the append operation will do the same as - `DataFrame.extend` which extends the memory backed by this `Series` with - the values from `other`. - - Different from `append chunks`, `extend` appends the data from `other` to - the underlying memory locations and thus may cause a reallocation (which are - expensive). - - If this does not cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `append_chunks` when you want to do a query after a - single append. For instance during online operations where you add `n` rows - and rerun a query. - - Prefer `append_chunks` over `extend` when you want to append many times - before doing a query. For instance when you read in multiple files and when - to store them in a single `Series`. In the latter case, finish the sequence - of `append_chunks` operations with a `rechunk`. - - Warnings - -------- - This method modifies the series in-place. The series is returned for - convenience only. - - See Also - -------- - extend - - Examples - -------- - >>> a = pl.Series("a", [1, 2, 3]) - >>> b = pl.Series("b", [4, 5]) - >>> a.append(b) - shape: (5,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ] - - The resulting series will consist of multiple chunks. 
- - >>> a.n_chunks() - 2 - - ''' - def extend(self, other: Series) -> Self: - ''' - Extend the memory backed by this Series with the values from another. - - Different from `append`, which adds the chunks from `other` to the chunks of - this series, `extend` appends the data from `other` to the underlying memory - locations and thus may cause a reallocation (which is expensive). - - If this does `not` cause a reallocation, the resulting data structure will not - have any extra chunks and thus will yield faster queries. - - Prefer `extend` over `append` when you want to do a query after a single - append. For instance, during online operations where you add `n` rows - and rerun a query. - - Prefer `append` over `extend` when you want to append many times - before doing a query. For instance, when you read in multiple files and want - to store them in a single `Series`. In the latter case, finish the sequence - of `append` operations with a `rechunk`. - - Parameters - ---------- - other - Series to extend the series with. - - Warnings - -------- - This method modifies the series in-place. The series is returned for - convenience only. - - See Also - -------- - append - - Examples - -------- - >>> a = pl.Series("a", [1, 2, 3]) - >>> b = pl.Series("b", [4, 5]) - >>> a.extend(b) - shape: (5,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - 5 - ] - - The resulting series will consist of a single chunk. - - >>> a.n_chunks() - 1 - - ''' - def filter(self, predicate: Series | list[bool]) -> Self: - ''' - Filter elements by a boolean mask. - - The original order of the remaining elements is preserved. - - Parameters - ---------- - predicate - Boolean mask. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> mask = pl.Series("", [True, False, True]) - >>> s.filter(mask) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 3 - ] - - ''' - def head(self, n: int = ...) -> Series: - ''' - Get the first `n` elements. - - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the last `abs(n)`. - - See Also - -------- - tail, slice - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.head(3) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - Pass a negative value to get all rows `except` the last `abs(n)`. - - >>> s.head(-3) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 2 - ] - - ''' - def tail(self, n: int = ...) -> Series: - ''' - Get the last `n` elements. - - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the first `abs(n)`. - - See Also - -------- - head, slice - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.tail(3) - shape: (3,) - Series: \'a\' [i64] - [ - 3 - 4 - 5 - ] - - Pass a negative value to get all rows `except` the first `abs(n)`. - - >>> s.tail(-3) - shape: (2,) - Series: \'a\' [i64] - [ - 4 - 5 - ] - - ''' - def limit(self, n: int = ...) -> Series: - """ - Get the first `n` elements. - - Alias for :func:`Series.head`. - - Parameters - ---------- - n - Number of elements to return. If a negative value is passed, return all - elements except the last `abs(n)`. - - See Also - -------- - head - - """ - def gather_every(self, n: int) -> Series: - ''' - Take every nth value in the Series and return as new Series. - - Parameters - ---------- - n - Gather every *n*-th row. 
- - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.gather_every(2) - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 3 - ] - - ''' - def sort(self) -> Self: - ''' - Sort this Series. - - Parameters - ---------- - descending - Sort in descending order. - in_place - Sort in-place. - - Examples - -------- - >>> s = pl.Series("a", [1, 3, 4, 2]) - >>> s.sort() - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 4 - ] - >>> s.sort(descending=True) - shape: (4,) - Series: \'a\' [i64] - [ - 4 - 3 - 2 - 1 - ] - - ''' - def top_k(self, k: int | IntoExprColumn = ...) -> Series: - ''' - Return the `k` largest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - bottom_k - - Examples - -------- - >>> s = pl.Series("a", [2, 5, 1, 4, 3]) - >>> s.top_k(3) - shape: (3,) - Series: \'a\' [i64] - [ - 5 - 4 - 3 - ] - - ''' - def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: - ''' - Return the `k` smallest elements. - - This has time complexity: - - .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) - - Parameters - ---------- - k - Number of elements to return. - - See Also - -------- - top_k - - Examples - -------- - >>> s = pl.Series("a", [2, 5, 1, 4, 3]) - >>> s.bottom_k(3) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def arg_sort(self) -> Series: - ''' - Get the index values that would sort this Series. - - Parameters - ---------- - descending - Sort in descending order. - nulls_last - Place null values last instead of first. - - Examples - -------- - >>> s = pl.Series("a", [5, 3, 4, 1, 2]) - >>> s.arg_sort() - shape: (5,) - Series: \'a\' [u32] - [ - 3 - 4 - 1 - 2 - 0 - ] - - ''' - def arg_unique(self) -> Series: - ''' - Get unique index as Series. - - Returns - ------- - Series - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.arg_unique() - shape: (3,) - Series: \'a\' [u32] - [ - 0 - 1 - 3 - ] - - ''' - def arg_min(self) -> int | None: - ''' - Get the index of the minimal value. - - Returns - ------- - int - - Examples - -------- - >>> s = pl.Series("a", [3, 2, 1]) - >>> s.arg_min() - 2 - - ''' - def arg_max(self) -> int | None: - ''' - Get the index of the maximal value. - - Returns - ------- - int - - Examples - -------- - >>> s = pl.Series("a", [3, 2, 1]) - >>> s.arg_max() - 0 - - ''' - def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: - """ - Find indices where elements should be inserted to maintain order. - - .. math:: a[i-1] < v <= a[i] - - Parameters - ---------- - element - Expression or scalar value. - side : {'any', 'left', 'right'} - If 'any', the index of the first suitable location found is given. - If 'left', the index of the leftmost suitable location found is given. - If 'right', return the rightmost suitable location found is given. - - """ - def unique(self) -> Series: - ''' - Get unique elements in series. - - Parameters - ---------- - maintain_order - Maintain order of data. This requires more work. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.unique().sort() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: - ''' - Take values by index. - - Parameters - ---------- - indices - Index location used for selection. 
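As an aside on `search_sorted` (documented a little earlier without a doctest), a minimal sketch; the values are illustrative and the series must already be sorted for the result to be meaningful:

    import polars as pl

    s = pl.Series("a", [1, 2, 4, 8])
    s.search_sorted(3)                 # 2 -- insert before index 2 to keep order
    s.search_sorted(4, side="left")    # 2 -- leftmost suitable position
    s.search_sorted(4, side="right")   # 3 -- rightmost suitable position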
- - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4]) - >>> s.gather([1, 3]) - shape: (2,) - Series: \'a\' [i64] - [ - 2 - 4 - ] - - ''' - def null_count(self) -> int: - """Count the null values in this Series.""" - def has_validity(self) -> bool: - """ - Return True if the Series has a validity bitmask. - - If there is no mask, it means that there are no `null` values. - - Notes - ----- - While the *absence* of a validity bitmask guarantees that a Series does not - have `null` values, the converse is not true, eg: the *presence* of a - bitmask does not mean that there are null values, as every value of the - bitmask could be `false`. - - To confirm that a column has `null` values use :func:`null_count`. - - """ - def is_empty(self) -> bool: - ''' - Check if the Series is empty. - - Examples - -------- - >>> s = pl.Series("a", [], dtype=pl.Float32) - >>> s.is_empty() - True - - ''' - def is_sorted(self) -> bool: - """ - Check if the Series is sorted. - - Parameters - ---------- - descending - Check if the Series is sorted in descending order - - """ - def not_(self) -> Series: - ''' - Negate a boolean Series. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [True, False, False]) - >>> s.not_() - shape: (3,) - Series: \'a\' [bool] - [ - false - true - true - ] - - ''' - def is_null(self) -> Series: - ''' - Returns a boolean Series indicating which values are null. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) - >>> s.is_null() - shape: (4,) - Series: \'a\' [bool] - [ - false - false - false - true - ] - - ''' - def is_not_null(self) -> Series: - ''' - Returns a boolean Series indicating which values are not null. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) - >>> s.is_not_null() - shape: (4,) - Series: \'a\' [bool] - [ - true - true - true - false - ] - - ''' - def is_finite(self) -> Series: - ''' - Returns a boolean Series indicating which values are finite. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, np.inf]) - >>> s.is_finite() - shape: (3,) - Series: \'a\' [bool] - [ - true - true - false - ] - - ''' - def is_infinite(self) -> Series: - ''' - Returns a boolean Series indicating which values are infinite. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, np.inf]) - >>> s.is_infinite() - shape: (3,) - Series: \'a\' [bool] - [ - false - false - true - ] - - ''' - def is_nan(self) -> Series: - ''' - Returns a boolean Series indicating which values are not NaN. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) - >>> s.is_nan() - shape: (4,) - Series: \'a\' [bool] - [ - false - false - false - true - ] - - ''' - def is_not_nan(self) -> Series: - ''' - Returns a boolean Series indicating which values are not NaN. - - Returns - ------- - Series - Series of data type :class:`Boolean`. 
- - Examples - -------- - >>> import numpy as np - >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) - >>> s.is_not_nan() - shape: (4,) - Series: \'a\' [bool] - [ - true - true - true - false - ] - - ''' - def is_in(self, other: Series | Collection[Any]) -> Series: - ''' - Check if elements of this Series are in the other Series. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [2, 4]) - >>> s2.is_in(s) - shape: (2,) - Series: \'b\' [bool] - [ - true - false - ] - - >>> # check if some values are a member of sublists - >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) - >>> optional_members = pl.Series("optional_members", [1, 2, 3]) - >>> print(sets) - shape: (3,) - Series: \'sets\' [list[i64]] - [ - [1, 2, 3] - [1, 2] - [9, 10] - ] - >>> print(optional_members) - shape: (3,) - Series: \'optional_members\' [i64] - [ - 1 - 2 - 3 - ] - >>> optional_members.is_in(sets) - shape: (3,) - Series: \'optional_members\' [bool] - [ - true - true - false - ] - - ''' - def arg_true(self) -> Series: - ''' - Get index values where Boolean Series evaluate True. - - Returns - ------- - Series - Series of data type :class:`UInt32`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> (s == 2).arg_true() - shape: (1,) - Series: \'a\' [u32] - [ - 1 - ] - - ''' - def is_unique(self) -> Series: - ''' - Get mask of all unique values. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.is_unique() - shape: (4,) - Series: \'a\' [bool] - [ - true - false - false - true - ] - - ''' - def is_first_distinct(self) -> Series: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series([1, 1, 2, 3, 2]) - >>> s.is_first_distinct() - shape: (5,) - Series: '' [bool] - [ - true - false - true - true - false - ] - - """ - def is_last_distinct(self) -> Series: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series([1, 1, 2, 3, 2]) - >>> s.is_last_distinct() - shape: (5,) - Series: '' [bool] - [ - false - true - false - true - true - ] - - """ - def is_duplicated(self) -> Series: - ''' - Get mask of all duplicated values. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.is_duplicated() - shape: (4,) - Series: \'a\' [bool] - [ - false - true - true - false - ] - - ''' - def explode(self) -> Series: - """ - Explode a list Series. - - This means that every item is expanded to a new row. - - Returns - ------- - Series - Series with the data type of the list elements. - - See Also - -------- - Series.list.explode : Explode a list column. - Series.str.explode : Explode a string column. - - """ - def equals(self, other: Series) -> bool: - ''' - Check whether the Series is equal to another Series. - - Parameters - ---------- - other - Series to compare with. - null_equal - Consider null values as equal. - strict - Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a - `pl.Int64` will return `False`. 
- - See Also - -------- - assert_series_equal - - Examples - -------- - >>> s1 = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [4, 5, 6]) - >>> s1.equals(s1) - True - >>> s1.equals(s2) - False - ''' - def len(self) -> int: - ''' - Return the number of elements in this Series. - - Null values are treated like regular elements in this context. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None]) - >>> s.len() - 3 - - ''' - def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: - ''' - Cast between data types. - - Parameters - ---------- - dtype - DataType to cast to. - strict - Throw an error if a cast could not be done (for instance, due to an - overflow). - - Examples - -------- - >>> s = pl.Series("a", [True, False, True]) - >>> s - shape: (3,) - Series: \'a\' [bool] - [ - true - false - true - ] - - >>> s.cast(pl.UInt32) - shape: (3,) - Series: \'a\' [u32] - [ - 1 - 0 - 1 - ] - - ''' - def to_physical(self) -> Series: - ''' - Cast to physical representation of the logical dtype. - - - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` - - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` - - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` - - `List(inner)` -> `List(physical of inner)` - - Other data types will be left unchanged. - - Examples - -------- - Replicating the pandas - `pd.Series.factorize - `_ - method. - - >>> s = pl.Series("values", ["a", None, "x", "a"]) - >>> s.cast(pl.Categorical).to_physical() - shape: (4,) - Series: \'values\' [u32] - [ - 0 - null - 1 - 0 - ] - - ''' - def to_list(self) -> list[Any]: - ''' - Convert this Series to a Python List. This operation clones data. - - Parameters - ---------- - use_pyarrow - Use pyarrow for the conversion. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.to_list() - [1, 2, 3] - >>> type(s.to_list()) - - - ''' - def rechunk(self) -> Self: - """ - Create a single chunk of memory for this Series. - - Parameters - ---------- - in_place - In place or not. - - """ - def reverse(self) -> Series: - ''' - Return Series in reverse order. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) - >>> s.reverse() - shape: (3,) - Series: \'a\' [i8] - [ - 3 - 2 - 1 - ] - - ''' - def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: - ''' - Get a boolean mask of the values that fall between the given start/end values. - - Parameters - ---------- - lower_bound - Lower bound value. Accepts expression input. Non-expression inputs - (including strings) are parsed as literals. - upper_bound - Upper bound value. Accepts expression input. Non-expression inputs - (including strings) are parsed as literals. - closed : {\'both\', \'left\', \'right\', \'none\'} - Define which sides of the interval are closed (inclusive). 
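A quick sketch of `rechunk` (documented a little earlier without a doctest), reusing the multi-chunk setup from the `n_chunks` example:

    import polars as pl

    s = pl.concat([pl.Series([1, 2]), pl.Series([3, 4])], rechunk=False)
    s.n_chunks()             # 2 -- one chunk per input Series
    s.rechunk().n_chunks()   # 1 -- data gathered into a single contiguous chunk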
- - Examples - -------- - >>> s = pl.Series("num", [1, 2, 3, 4, 5]) - >>> s.is_between(2, 4) - shape: (5,) - Series: \'num\' [bool] - [ - false - true - true - true - false - ] - - Use the `closed` argument to include or exclude the values at the bounds: - - >>> s.is_between(2, 4, closed="left") - shape: (5,) - Series: \'num\' [bool] - [ - false - true - true - false - false - ] - - You can also use strings as well as numeric/temporal values: - - >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) - >>> s.is_between("b", "d", closed="both") - shape: (5,) - Series: \'s\' [bool] - [ - false - true - true - true - false - ] - - ''' - def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: - ''' - Convert this Series to numpy. - - This operation may clone data but is completely safe. Note that: - - - data which is purely numeric AND without null values is not cloned; - - floating point `nan` values can be zero-copied; - - booleans can\'t be zero-copied. - - To ensure that no data is cloned, set `zero_copy_only=True`. - - Parameters - ---------- - *args - args will be sent to pyarrow.Array.to_numpy. - zero_copy_only - If True, an exception will be raised if the conversion to a numpy - array would require copying the underlying data (e.g. in presence - of nulls, or for non-primitive types). - writable - For numpy arrays created with zero copy (view on the Arrow data), - the resulting array is not writable (Arrow data is immutable). - By setting this to True, a copy of the array is made to ensure - it is writable. - use_pyarrow - Use `pyarrow.Array.to_numpy - `_ - - for the conversion to numpy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> arr = s.to_numpy() - >>> arr # doctest: +IGNORE_RESULT - array([1, 2, 3], dtype=int64) - >>> type(arr) - - - ''' - def _view(self) -> SeriesView: - ''' - Get a view into this Series data with a numpy array. - - This operation doesn\'t clone data, but does not include missing values. - - Returns - ------- - SeriesView - - Parameters - ---------- - ignore_nulls - If True then nulls are converted to 0. - If False then an Exception is raised if nulls are present. - - Examples - -------- - >>> s = pl.Series("a", [1, None]) - >>> s._view(ignore_nulls=True) - SeriesView([1, 0]) - - ''' - def to_arrow(self) -> pa.Array: - ''' - Get the underlying Arrow Array. - - If the Series contains only a single chunk this operation is zero copy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s = s.to_arrow() - >>> s # doctest: +ELLIPSIS - - [ - 1, - 2, - 3 - ] - - ''' - def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: - ''' - Convert this Series to a pandas Series. - - This requires that :mod:`pandas` and :mod:`pyarrow` are installed. - This operation clones data, unless `use_pyarrow_extension_array=True`. - - Parameters - ---------- - use_pyarrow_extension_array - Further operations on this Pandas series, might trigger conversion to numpy. - Use PyArrow backed-extension array instead of numpy array for pandas - Series. This allows zero copy operations and preservation of nulls - values. - Further operations on this pandas Series, might trigger conversion - to NumPy arrays if that operation is not supported by pyarrow compute - functions. - kwargs - Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. 
- - Examples - -------- - >>> s1 = pl.Series("a", [1, 2, 3]) - >>> s1.to_pandas() - 0 1 - 1 2 - 2 3 - Name: a, dtype: int64 - >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP - 0 1 - 1 2 - 2 3 - Name: a, dtype: int64[pyarrow] - >>> s2 = pl.Series("b", [1, 2, None, 4]) - >>> s2.to_pandas() - 0 1.0 - 1 2.0 - 2 NaN - 3 4.0 - Name: b, dtype: float64 - >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP - 0 1 - 1 2 - 2 - 3 4 - Name: b, dtype: int64[pyarrow] - - ''' - def to_init_repr(self, n: int = ...) -> str: - ''' - Convert Series to instantiatable string representation. - - Parameters - ---------- - n - Only use first n elements. - - See Also - -------- - polars.Series.to_init_repr - polars.from_repr - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) - >>> print(s.to_init_repr()) - pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) - >>> s_from_str_repr = eval(s.to_init_repr()) - >>> s_from_str_repr - shape: (4,) - Series: \'a\' [i16] - [ - 1 - 2 - null - 4 - ] - - ''' - def set(self, filter: Series, value: int | float | str | bool | None) -> Series: - ''' - Set masked values. - - Parameters - ---------- - filter - Boolean mask. - value - Value with which to replace the masked values. - - Notes - ----- - Use of this function is frequently an anti-pattern, as it can - block optimisation (predicate pushdown, etc). Consider using - `pl.when(predicate).then(value).otherwise(self)` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.set(s == 2, 10) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 10 - 3 - ] - - It is better to implement this as follows: - - >>> s.to_frame().select( - ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) - ... ) - shape: (3, 1) - ┌─────────┐ - │ literal │ - │ --- │ - │ i64 │ - ╞═════════╡ - │ 1 │ - │ 10 │ - │ 3 │ - └─────────┘ - - ''' - def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: - ''' - Set values at the index locations. - - Parameters - ---------- - indices - Integers representing the index locations. - values - Replacement values. - - Notes - ----- - Use of this function is frequently an anti-pattern, as it can - block optimization (predicate pushdown, etc). Consider using - `pl.when(predicate).then(value).otherwise(self)` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.scatter(1, 10) - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 10 - 3 - ] - - It is better to implement this as follows: - - >>> s.to_frame().with_row_count("row_nr").select( - ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) - ... ) - shape: (3, 1) - ┌─────────┐ - │ literal │ - │ --- │ - │ i64 │ - ╞═════════╡ - │ 1 │ - │ 10 │ - │ 3 │ - └─────────┘ - - ''' - def clear(self, n: int = ...) -> Series: - ''' - Create an empty copy of the current Series, with zero to \'n\' elements. - - The copy has an identical name/dtype, but no data. - - Parameters - ---------- - n - Number of (empty) elements to return in the cleared frame. - - See Also - -------- - clone : Cheap deepcopy/clone. 
- - Examples - -------- - >>> s = pl.Series("a", [None, True, False]) - >>> s.clear() - shape: (0,) - Series: \'a\' [bool] - [ - ] - - >>> s.clear(n=2) - shape: (2,) - Series: \'a\' [bool] - [ - null - null - ] - - ''' - def clone(self) -> Self: - ''' - Create a copy of this Series. - - This is a cheap operation that does not copy data. - - See Also - -------- - clear : Create an empty copy of the current Series, with identical - schema but no data. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.clone() - shape: (3,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - ] - - ''' - def fill_nan(self, value: int | float | Expr | None) -> Series: - ''' - Fill floating point NaN value with a fill value. - - Parameters - ---------- - value - Value used to fill NaN values. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, float("nan")]) - >>> s.fill_nan(0) - shape: (4,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - 0.0 - ] - - ''' - def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: - ''' - Fill null values using the specified value or strategy. - - Parameters - ---------- - value - Value used to fill null values. - strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} - Strategy used to fill null values. - limit - Number of consecutive null values to fill when using the \'forward\' or - \'backward\' strategy. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, None]) - >>> s.fill_null(strategy="forward") - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 3 - ] - >>> s.fill_null(strategy="min") - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 2 - 3 - 1 - ] - >>> s = pl.Series("b", ["x", None, "z"]) - >>> s.fill_null(pl.lit("")) - shape: (3,) - Series: \'b\' [str] - [ - "x" - "" - "z" - ] - - ''' - def floor(self) -> Series: - ''' - Rounds down to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.floor() - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - ] - - ''' - def ceil(self) -> Series: - ''' - Rounds up to the nearest integer value. - - Only works on floating point Series. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.ceil() - shape: (3,) - Series: \'a\' [f64] - [ - 2.0 - 3.0 - 4.0 - ] - - ''' - def round(self, decimals: int = ...) -> Series: - ''' - Round underlying floating point data by `decimals` digits. - - Examples - -------- - >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) - >>> s.round(2) - shape: (3,) - Series: \'a\' [f64] - [ - 1.12 - 2.57 - 3.9 - ] - - Parameters - ---------- - decimals - number of decimals to round by. - - ''' - def round_sig_figs(self, digits: int) -> Series: - """ - Round to a number of significant figures. - - Parameters - ---------- - digits - Number of significant figures to round to. - - Examples - -------- - >>> s = pl.Series([0.01234, 3.333, 1234.0]) - >>> s.round_sig_figs(2) - shape: (3,) - Series: '' [f64] - [ - 0.012 - 3.3 - 1200.0 - ] - - """ - def dot(self, other: Series | ArrayLike) -> float | None: - ''' - Compute the dot/inner product between two Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) - >>> s.dot(s2) - 32.0 - - Parameters - ---------- - other - Series (or array) to compute dot product with. - - ''' - def mode(self) -> Series: - ''' - Compute the most occurring value(s). 
- - Can return multiple Values. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.mode() - shape: (1,) - Series: \'a\' [i64] - [ - 2 - ] - - ''' - def sign(self) -> Series: - ''' - Compute the element-wise indication of the sign. - - The returned values can be -1, 0, or 1: - - * -1 if x < 0. - * 0 if x == 0. - * 1 if x > 0. - - (null values are preserved as-is). - - Examples - -------- - >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) - >>> s.sign() - shape: (5,) - Series: \'a\' [i64] - [ - -1 - 0 - 0 - 1 - null - ] - - ''' - def sin(self) -> Series: - ''' - Compute the element-wise value for the sine. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.sin() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.0 - 1.2246e-16 - ] - - ''' - def cos(self) -> Series: - ''' - Compute the element-wise value for the cosine. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.cos() - shape: (3,) - Series: \'a\' [f64] - [ - 1.0 - 6.1232e-17 - -1.0 - ] - - ''' - def tan(self) -> Series: - ''' - Compute the element-wise value for the tangent. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.tan() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.6331e16 - -1.2246e-16 - ] - - ''' - def cot(self) -> Series: - ''' - Compute the element-wise value for the cotangent. - - Examples - -------- - >>> import math - >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) - >>> s.cot() - shape: (3,) - Series: \'a\' [f64] - [ - inf - 6.1232e-17 - -8.1656e15 - ] - - ''' - def arcsin(self) -> Series: - ''' - Compute the element-wise value for the inverse sine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arcsin() - shape: (3,) - Series: \'a\' [f64] - [ - 1.570796 - 0.0 - -1.570796 - ] - - ''' - def arccos(self) -> Series: - ''' - Compute the element-wise value for the inverse cosine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arccos() - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 1.570796 - 3.141593 - ] - - ''' - def arctan(self) -> Series: - ''' - Compute the element-wise value for the inverse tangent. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arctan() - shape: (3,) - Series: \'a\' [f64] - [ - 0.785398 - 0.0 - -0.785398 - ] - - ''' - def arcsinh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic sine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.arcsinh() - shape: (3,) - Series: \'a\' [f64] - [ - 0.881374 - 0.0 - -0.881374 - ] - - ''' - def arccosh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic cosine. - - Examples - -------- - >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) - >>> s.arccosh() - shape: (4,) - Series: \'a\' [f64] - [ - 2.292432 - 0.0 - NaN - NaN - ] - - ''' - def arctanh(self) -> Series: - ''' - Compute the element-wise value for the inverse hyperbolic tangent. - - Examples - -------- - >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) - >>> s.arctanh() - shape: (7,) - Series: \'a\' [f64] - [ - NaN - inf - 0.549306 - 0.0 - -0.549306 - -inf - NaN - ] - - ''' - def sinh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic sine. 
- - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.sinh() - shape: (3,) - Series: \'a\' [f64] - [ - 1.175201 - 0.0 - -1.175201 - ] - - ''' - def cosh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic cosine. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.cosh() - shape: (3,) - Series: \'a\' [f64] - [ - 1.543081 - 1.0 - 1.543081 - ] - - ''' - def tanh(self) -> Series: - ''' - Compute the element-wise value for the hyperbolic tangent. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 0.0, -1.0]) - >>> s.tanh() - shape: (3,) - Series: \'a\' [f64] - [ - 0.761594 - 0.0 - -0.761594 - ] - - ''' - def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - ''' - Map a custom/user-defined function (UDF) over elements in this Series. - - .. warning:: - This method is much slower than the native expressions API. - Only use it if you cannot implement your logic otherwise. - - If the function returns a different datatype, the return_dtype arg should - be set, otherwise the method will fail. - - Implementing logic using a Python function is almost always *significantly* - slower and more memory intensive than implementing the same logic using - the native expression API because: - - - The native expression engine runs in Rust; UDFs run in Python. - - Use of Python UDFs forces the DataFrame to be materialized in memory. - - Polars-native expressions can be parallelised (UDFs typically cannot). - - Polars-native expressions can be logically optimised (UDFs cannot). - - Wherever possible you should strongly prefer the native expression API - to achieve the best performance. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output datatype. If none is given, the same datatype as this Series will be - used. - skip_nulls - Nulls will be skipped and not passed to the python function. - This is faster because python can be skipped and because we call - more specialized functions. - - Warnings - -------- - If `return_dtype` is not provided, this may lead to unexpected results. - We allow this, but it is considered a bug in the user\'s query. - - Notes - ----- - If your function is expensive and you don\'t want it to be called more than - once for a given input, consider applying an `@lru_cache` decorator to it. - If your data is suitable you may achieve *significant* speedups. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP - shape: (3,) - Series: \'a\' [i64] - [ - 11 - 12 - 13 - ] - - Returns - ------- - Series - - ''' - def shift(self, n: int = ...) -> Series: - """ - Shift values by the given number of indices. - - Parameters - ---------- - n - Number of indices to shift forward. If a negative value is passed, values - are shifted in the opposite direction instead. - fill_value - Fill the resulting null values with this value. Accepts expression input. - Non-expression inputs are parsed as literals. - - Notes - ----- - This method is similar to the `LAG` operation in SQL when the value for `n` - is positive. With a negative value for `n`, it is similar to `LEAD`. - - Examples - -------- - By default, values are shifted forward by one index. - - >>> s = pl.Series([1, 2, 3, 4]) - >>> s.shift() - shape: (4,) - Series: '' [i64] - [ - null - 1 - 2 - 3 - ] - - Pass a negative value to shift in the opposite direction instead. 
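Picking up the `@lru_cache` note from `map_elements` above, a minimal sketch; the squaring function is a stand-in for a genuinely expensive per-element computation:

    from functools import lru_cache

    import polars as pl

    @lru_cache(maxsize=None)
    def expensive(x: int) -> int:
        return x ** 2   # placeholder for costly work

    s = pl.Series("a", [1, 2, 2, 2, 3])
    s.map_elements(expensive, return_dtype=pl.Int64)   # repeated inputs hit the cache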
- - >>> s.shift(-2) - shape: (4,) - Series: '' [i64] - [ - 3 - 4 - null - null - ] - - Specify `fill_value` to fill the resulting null values. - - >>> s.shift(-2, fill_value=100) - shape: (4,) - Series: '' [i64] - [ - 3 - 4 - 100 - 100 - ] - - """ - def zip_with(self, mask: Series, other: Series) -> Self: - """ - Take values from self or other based on the given mask. - - Where mask evaluates true, take values from self. Where mask evaluates false, - take values from other. - - Parameters - ---------- - mask - Boolean Series. - other - Series of same type. - - Returns - ------- - Series - - Examples - -------- - >>> s1 = pl.Series([1, 2, 3, 4, 5]) - >>> s2 = pl.Series([5, 4, 3, 2, 1]) - >>> s1.zip_with(s1 < s2, s2) - shape: (5,) - Series: '' [i64] - [ - 1 - 2 - 3 - 2 - 1 - ] - >>> mask = pl.Series([True, False, True, False, True]) - >>> s1.zip_with(mask, s2) - shape: (5,) - Series: '' [i64] - [ - 1 - 4 - 3 - 2 - 5 - ] - - """ - def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling min (moving min) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their min. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_min(window_size=3) - shape: (5,) - Series: \'a\' [i64] - [ - null - null - 100 - 200 - 300 - ] - - ''' - def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling max (moving max) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their max. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_max(window_size=2) - shape: (5,) - Series: \'a\' [i64] - [ - null - 200 - 300 - 400 - 500 - ] - - ''' - def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling mean (moving mean) over the values in this array. - - A window of length `window_size` will traverse the array. 
The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their mean. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [100, 200, 300, 400, 500]) - >>> s.rolling_mean(window_size=2) - shape: (5,) - Series: \'a\' [f64] - [ - null - 150.0 - 250.0 - 350.0 - 450.0 - ] - - ''' - def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Apply a rolling sum (moving sum) over the values in this array. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their sum. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length of the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.rolling_sum(window_size=2) - shape: (5,) - Series: \'a\' [i64] - [ - null - 3 - 5 - 7 - 9 - ] - - ''' - def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling std dev. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. The resulting values will be aggregated to their std dev. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_std(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 1.0 - 1.527525 - 2.0 - ] - - ''' - def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling variance. - - A window of length `window_size` will traverse the array. The values that fill - this window will (optionally) be multiplied with the weights given by the - `weight` vector. 
The resulting values will be aggregated to their variance. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - ddof - "Delta Degrees of Freedom": The divisor for a length N window is N - ddof - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_var(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 1.0 - 2.333333 - 4.0 - ] - - ''' - def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a custom rolling window function. - - .. warning:: - Computing custom functions is extremely slow. Use specialized rolling - functions such as :func:`Series.rolling_sum` if at all possible. - - Parameters - ---------- - function - Custom aggregation function. - window_size - Size of the window. The window at a given row will include the row - itself and the `window_size - 1` elements before it. - weights - A list of weights with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window. - - Warnings - -------- - - - Examples - -------- - >>> from numpy import nansum - >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) - >>> s.rolling_map(nansum, window_size=3) - shape: (5,) - Series: \'\' [f64] - [ - null - null - 22.0 - 11.0 - 17.0 - ] - - ''' - def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling median. - - Parameters - ---------- - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_median(window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 2.0 - 3.0 - 4.0 - 6.0 - ] - - ''' - def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - ''' - Compute a rolling quantile. - - The window at a given row will include the row itself and the `window_size - 1` - elements before it. - - Parameters - ---------- - quantile - Quantile between 0.0 and 1.0. - interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} - Interpolation method. - window_size - The length of the window. 
- weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) - >>> s.rolling_quantile(quantile=0.33, window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.0 - 2.0 - 3.0 - 4.0 - ] - >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) - shape: (6,) - Series: \'a\' [f64] - [ - null - null - 1.66 - 2.66 - 3.66 - 5.32 - ] - - ''' - def rolling_skew(self, window_size: int) -> Series: - """ - Compute a rolling skew. - - The window at a given row includes the row itself and the - `window_size - 1` elements before it. - - Parameters - ---------- - window_size - Integer size of the rolling window. - bias - If False, the calculations are corrected for statistical bias. - - Examples - -------- - >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) - shape: (4,) - Series: '' [f64] - [ - null - null - 0.381802 - 0.47033 - ] - - Note how the values match - - >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() - (0.38180177416060584, 0.47033046033698594) - - """ - def sample(self, n: int | None = ...) -> Series: - ''' - Sample from this Series. - - Parameters - ---------- - n - Number of items to return. Cannot be used with `fraction`. Defaults to 1 if - `fraction` is None. - fraction - Fraction of items to return. Cannot be used with `n`. - with_replacement - Allow values to be sampled more than once. - shuffle - Shuffle the order of sampled data points. - seed - Seed for the random number generator. If set to None (default), a - random seed is generated for each sample operation. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT - shape: (2,) - Series: \'a\' [i64] - [ - 1 - 5 - ] - - ''' - def peak_max(self) -> Self: - ''' - Get a boolean mask of the local maximum peaks. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3, 4, 5]) - >>> s.peak_max() - shape: (5,) - Series: \'a\' [bool] - [ - false - false - false - false - true - ] - - ''' - def peak_min(self) -> Self: - ''' - Get a boolean mask of the local minimum peaks. - - Examples - -------- - >>> s = pl.Series("a", [4, 1, 3, 2, 5]) - >>> s.peak_min() - shape: (5,) - Series: \'a\' [bool] - [ - false - true - false - true - false - ] - - ''' - def n_unique(self) -> int: - ''' - Count the number of unique values in this Series. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.n_unique() - 3 - - ''' - def shrink_to_fit(self) -> Series: - """ - Shrink Series memory usage. - - Shrinks the underlying array capacity to exactly fit the actual data. - (Note that this function does not change the Series data type). - - """ - def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: - ''' - Hash the Series. - - The hash value is of type `UInt64`. - - Parameters - ---------- - seed - Random seed parameter. Defaults to 0. - seed_1 - Random seed parameter. Defaults to `seed` if not set. - seed_2 - Random seed parameter. Defaults to `seed` if not set. - seed_3 - Random seed parameter. Defaults to `seed` if not set. 
- - Notes - ----- - This implementation of :func:`hash` does not guarantee stable results - across different Polars versions. Its stability is only guaranteed within a - single version. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.hash(seed=42) # doctest: +IGNORE_RESULT - shape: (3,) - Series: \'a\' [u64] - [ - 10734580197236529959 - 3022416320763508302 - 13756996518000038261 - ] - - ''' - def reinterpret(self) -> Series: - """ - Reinterpret the underlying bits as a signed/unsigned integer. - - This operation is only allowed for 64bit integers. For lower bits integers, - you can safely use that cast operation. - - Parameters - ---------- - signed - If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. - - """ - def interpolate(self, method: InterpolationMethod = ...) -> Series: - ''' - Fill null values using interpolation. - - Parameters - ---------- - method : {\'linear\', \'nearest\'} - Interpolation method. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, None, None, 5]) - >>> s.interpolate() - shape: (5,) - Series: \'a\' [f64] - [ - 1.0 - 2.0 - 3.0 - 4.0 - 5.0 - ] - - ''' - def abs(self) -> Series: - """ - Compute absolute values. - - Same as `abs(series)`. - """ - def rank(self, method: RankMethod = ...) -> Series: - ''' - Assign ranks to data, dealing with ties appropriately. - - Parameters - ---------- - method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} - The method used to assign ranks to tied elements. - The following methods are available (default is \'average\'): - - - \'average\' : The average of the ranks that would have been assigned to - all the tied values is assigned to each value. - - \'min\' : The minimum of the ranks that would have been assigned to all - the tied values is assigned to each value. (This is also referred to - as "competition" ranking.) - - \'max\' : The maximum of the ranks that would have been assigned to all - the tied values is assigned to each value. - - \'dense\' : Like \'min\', but the rank of the next highest element is - assigned the rank immediately after those assigned to the tied - elements. - - \'ordinal\' : All values are given a distinct rank, corresponding to - the order that the values occur in the Series. - - \'random\' : Like \'ordinal\', but the rank for ties is not dependent - on the order that the values occur in the Series. - descending - Rank in descending order. - seed - If `method="random"`, use this as seed. - - Examples - -------- - The \'average\' method: - - >>> s = pl.Series("a", [3, 6, 1, 1, 6]) - >>> s.rank() - shape: (5,) - Series: \'a\' [f64] - [ - 3.0 - 4.5 - 1.5 - 1.5 - 4.5 - ] - - The \'ordinal\' method: - - >>> s = pl.Series("a", [3, 6, 1, 1, 6]) - >>> s.rank("ordinal") - shape: (5,) - Series: \'a\' [u32] - [ - 3 - 4 - 1 - 2 - 5 - ] - - ''' - def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: - ''' - Calculate the first discrete difference between shifted items. - - Parameters - ---------- - n - Number of slots to shift. - null_behavior : {\'ignore\', \'drop\'} - How to handle null values. - - Examples - -------- - >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) - >>> s.diff() - shape: (5,) - Series: \'s\' [i8] - [ - null - -10 - 20 - -5 - 10 - ] - - >>> s.diff(n=2) - shape: (5,) - Series: \'s\' [i8] - [ - null - null - 10 - 15 - 5 - ] - - >>> s.diff(n=2, null_behavior="drop") - shape: (3,) - Series: \'s\' [i8] - [ - 10 - 15 - 5 - ] - - ''' - def pct_change(self, n: int | IntoExprColumn = ...) 
-> Series: - """ - Computes percentage change between values. - - Percentage change (as fraction) between current element and most-recent - non-null element at least `n` period(s) before the current element. - - Computes the change from the previous row by default. - - Parameters - ---------- - n - periods to shift for forming percent change. - - Examples - -------- - >>> pl.Series(range(10)).pct_change() - shape: (10,) - Series: '' [f64] - [ - null - inf - 1.0 - 0.5 - 0.333333 - 0.25 - 0.2 - 0.166667 - 0.142857 - 0.125 - ] - - >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) - shape: (10,) - Series: '' [f64] - [ - null - null - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - 3.0 - ] - - """ - def skew(self) -> float | None: - """ - Compute the sample skewness of a data set. - - For normally distributed data, the skewness should be about zero. For - unimodal continuous distributions, a skewness value greater than zero means - that there is more weight in the right tail of the distribution. The - function `skewtest` can be used to determine if the skewness value - is close enough to zero, statistically speaking. - - - See scipy.stats for more information. - - Parameters - ---------- - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - Notes - ----- - The sample skewness is computed as the Fisher-Pearson coefficient - of skewness, i.e. - - .. math:: g_1=\\frac{m_3}{m_2^{3/2}} - - where - - .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i - - is the biased sample :math:`i\\texttt{th}` central moment, and - :math:`\\bar{x}` is - the sample mean. If `bias` is False, the calculations are - corrected for bias and the value computed is the adjusted - Fisher-Pearson standardized moment coefficient, i.e. - - .. math:: - G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} - - """ - def kurtosis(self) -> float | None: - """ - Compute the kurtosis (Fisher or Pearson) of a dataset. - - Kurtosis is the fourth central moment divided by the square of the - variance. If Fisher's definition is used, then 3.0 is subtracted from - the result to give 0.0 for a normal distribution. - If bias is False then the kurtosis is calculated using k statistics to - eliminate bias coming from biased moment estimators - - See scipy.stats for more information - - Parameters - ---------- - fisher : bool, optional - If True, Fisher's definition is used (normal ==> 0.0). If False, - Pearson's definition is used (normal ==> 3.0). - bias : bool, optional - If False, the calculations are corrected for statistical bias. - - """ - def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: - """ - Set values outside the given boundaries to the boundary value. - - Parameters - ---------- - lower_bound - Lower bound. Accepts expression input. - Non-expression inputs are parsed as literals. - If set to `None` (default), no lower bound is applied. - upper_bound - Upper bound. Accepts expression input. - Non-expression inputs are parsed as literals. - If set to `None` (default), no upper bound is applied. - - See Also - -------- - when - - Notes - ----- - This method only works for numeric and temporal columns. To clip other data - types, consider writing a `when-then-otherwise` expression. See :func:`when`. 
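The note above recommends a `when`/`then`/`otherwise` expression for "clipping" non-numeric data; a minimal sketch of that pattern follows (the column name, allowed values, and fallback label are made up for illustration):

import polars as pl

# Hypothetical example: cap string codes to a fallback label, which
# Series.clip itself only supports for numeric and temporal data.
df = pl.DataFrame({"code": ["AA", "BB", "ZZ"]})
allowed = ["AA", "BB"]
df = df.with_columns(
    pl.when(pl.col("code").is_in(allowed))
    .then(pl.col("code"))
    .otherwise(pl.lit("OTHER"))
    .alias("code")
)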
- - Examples - -------- - Specifying both a lower and upper bound: - - >>> s = pl.Series([-50, 5, 50, None]) - >>> s.clip(1, 10) - shape: (4,) - Series: '' [i64] - [ - 1 - 5 - 10 - null - ] - - Specifying only a single bound: - - >>> s.clip(upper_bound=10) - shape: (4,) - Series: '' [i64] - [ - -50 - 5 - 10 - null - ] - - """ - def lower_bound(self) -> Self: - ''' - Return the lower bound of this Series\' dtype as a unit Series. - - See Also - -------- - upper_bound : return the upper bound of the given Series\' dtype. - - Examples - -------- - >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) - >>> s.lower_bound() - shape: (1,) - Series: \'s\' [i32] - [ - -2147483648 - ] - - >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) - >>> s.lower_bound() - shape: (1,) - Series: \'s\' [f32] - [ - -inf - ] - - ''' - def upper_bound(self) -> Self: - ''' - Return the upper bound of this Series\' dtype as a unit Series. - - See Also - -------- - lower_bound : return the lower bound of the given Series\' dtype. - - Examples - -------- - >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) - >>> s.upper_bound() - shape: (1,) - Series: \'s\' [i8] - [ - 127 - ] - - >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) - >>> s.upper_bound() - shape: (1,) - Series: \'s\' [f64] - [ - inf - ] - - ''' - def replace(self, mapping: dict[Any, Any]) -> Self: - ''' - Replace values according to the given mapping. - - Needs a global string cache for lazily evaluated queries on columns of - type `Categorical`. - - Parameters - ---------- - mapping - Mapping of values to their replacement. - default - Value to use when the mapping does not contain the lookup value. - Defaults to keeping the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - - See Also - -------- - str.replace - - Examples - -------- - Replace a single value by another value. Values not in the mapping remain - unchanged. - - >>> s = pl.Series("a", [1, 2, 2, 3]) - >>> s.replace({2: 100}) - shape: (4,) - Series: \'a\' [i64] - [ - 1 - 100 - 100 - 3 - ] - - Replace multiple values. Specify a default to set values not in the given map - to the default value. - - >>> s = pl.Series("country_code", ["FR", "ES", "DE", None]) - >>> country_code_map = { - ... "CA": "Canada", - ... "DE": "Germany", - ... "FR": "France", - ... None: "unspecified", - ... } - >>> s.replace(country_code_map, default=None) - shape: (4,) - Series: \'country_code\' [str] - [ - "France" - null - "Germany" - "unspecified" - ] - - The return type can be overridden with the `return_dtype` argument. - - >>> s = pl.Series("a", [0, 1, 2, 3]) - >>> s.replace({1: 10, 2: 20}, default=0, return_dtype=pl.UInt8) - shape: (4,) - Series: \'a\' [u8] - [ - 0 - 10 - 20 - 0 - ] - ''' - def reshape(self, dimensions: tuple[int, ...]) -> Series: - ''' - Reshape this Series to a flat Series or a Series of Lists. - - Parameters - ---------- - dimensions - Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that - dimension is inferred. - - Returns - ------- - Series - If a single dimension is given, results in a Series of the original - data type. - If a multiple dimensions are given, results in a Series of data type - :class:`List` with shape (rows, cols). - - See Also - -------- - Series.list.explode : Explode a list column. 
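As the See Also above hints, a `reshape` into a list Series can be undone through the list namespace; a small sketch, assuming the length divides evenly into the requested shape:

import polars as pl

s = pl.Series("foo", [1, 2, 3, 4, 5, 6])
nested = s.reshape((3, 2))     # three rows of two values: [1, 2], [3, 4], [5, 6]
flat = nested.list.explode()   # back to a flat Series of the original six values
inferred = s.reshape((-1, 3))  # -1 lets the missing dimension be inferred (two rows of three)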
- - Examples - -------- - >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) - >>> s.reshape((3, 3)) - shape: (3,) - Series: \'foo\' [list[i64]] - [ - [1, 2, 3] - [4, 5, 6] - [7, 8, 9] - ] - - ''' - def shuffle(self, seed: int | None = ...) -> Series: - ''' - Shuffle the contents of this Series. - - Parameters - ---------- - seed - Seed for the random number generator. If set to None (default), a - random seed is generated each time the shuffle is called. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.shuffle(seed=1) - shape: (3,) - Series: \'a\' [i64] - [ - 2 - 1 - 3 - ] - - ''' - def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - """ - Exponentially-weighted moving average. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series([1, 2, 3]) - >>> s.ewm_mean(com=1) - shape: (3,) - Series: '' [f64] - [ - 1.0 - 1.666667 - 2.428571 - ] - - """ - def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - ''' - Exponentially-weighted moving standard deviation. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. 
math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.ewm_std(com=1) - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 0.707107 - 0.963624 - ] - - ''' - def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: - ''' - Exponentially-weighted moving variance. - - Parameters - ---------- - com - Specify decay in terms of center of mass, :math:`\\gamma`, with - - .. math:: - \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 - span - Specify decay in terms of span, :math:`\\theta`, with - - .. math:: - \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 - half_life - Specify decay in terms of half-life, :math:`\\lambda`, with - - .. math:: - \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; - \\forall \\; \\lambda > 0 - alpha - Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. - adjust - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings - - - When `adjust=True` the EW function is calculated - using weights :math:`w_i = (1 - \\alpha)^i` - - When `adjust=False` the EW function is calculated - recursively by - - .. math:: - y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t - bias - When `bias=False`, apply a correction to make the estimate statistically - unbiased. - min_periods - Minimum number of observations in window required to have a value - (otherwise result is null). - ignore_nulls - Ignore missing values when calculating weights. - - - When `ignore_nulls=False` (default), weights are based on absolute - positions. - For example, the weights of :math:`x_0` and :math:`x_2` used in - calculating the final weighted average of - [:math:`x_0`, None, :math:`x_2`] are - :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and - :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. - - - When `ignore_nulls=True`, weights are based - on relative positions. 
For example, the weights of - :math:`x_0` and :math:`x_2` used in calculating the final weighted - average of [:math:`x_0`, None, :math:`x_2`] are - :math:`1-\\alpha` and :math:`1` if `adjust=True`, - and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.ewm_var(com=1) - shape: (3,) - Series: \'a\' [f64] - [ - 0.0 - 0.5 - 0.928571 - ] - - ''' - def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: - """ - Extremely fast method for extending the Series with 'n' copies of a value. - - Parameters - ---------- - value - A constant literal value (not an expression) with which to extend - the Series; can pass None to extend with nulls. - n - The number of additional values that will be added. - - Examples - -------- - >>> s = pl.Series([1, 2, 3]) - >>> s.extend_constant(99, n=2) - shape: (5,) - Series: '' [i64] - [ - 1 - 2 - 3 - 99 - 99 - ] - - """ - def set_sorted(self) -> Self: - ''' - Flags the Series as \'sorted\'. - - Enables downstream code to user fast paths for sorted arrays. - - Parameters - ---------- - descending - If the `Series` order is descending. - - Warnings - -------- - This can lead to incorrect results if this `Series` is not sorted!! - Use with care! - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.set_sorted().max() - 3 - - ''' - def new_from_index(self, index: int, length: int) -> Self: - """Create a new Series filled with values from the given index.""" - def shrink_dtype(self) -> Series: - """ - Shrink numeric columns to the minimal required datatype. - - Shrink to the dtype needed to fit the extrema of this [`Series`]. - This can be used to reduce memory pressure. - """ - def get_chunks(self) -> list[Series]: - """Get the chunks of this Series as a list of Series.""" - def implode(self) -> Self: - """Aggregate values into a list.""" - def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: - """ - Apply a custom/user-defined function (UDF) over elements in this Series. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Series.map_elements`. - - Parameters - ---------- - function - Custom function or lambda. - return_dtype - Output datatype. If none is given, the same datatype as this Series will be - used. - skip_nulls - Nulls will be skipped and not passed to the python function. - This is faster because python can be skipped and because we call - more specialized functions. - - """ - def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: - """ - Apply a custom rolling window function. - - .. deprecated:: 0.19.0 - This method has been renamed to :func:`Series.rolling_map`. - - Parameters - ---------- - function - Aggregation function - window_size - The length of the window. - weights - An optional slice with the same length as the window that will be multiplied - elementwise with the values in the window. - min_periods - The number of values in the window that should be non-null before computing - a result. If None, it will be set equal to window size. - center - Set the labels at the center of the window - - """ - def is_first(self) -> Series: - """ - Return a boolean mask indicating the first occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Series.is_first_distinct`. - - Returns - ------- - Series - Series of data type :class:`Boolean`. 
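The deprecation notes in this stretch each name a direct replacement; a brief illustrative sketch of the renamed calls side by side (values chosen only for demonstration):

import polars as pl

s = pl.Series("a", [1, 1, 2, 3])

s.map_elements(lambda x: x * 10)                 # replaces Series.apply
s.rolling_map(lambda w: w.sum(), window_size=2)  # replaces Series.rolling_apply
s.is_first_distinct()                            # replaces Series.is_first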
- - """ - def is_last(self) -> Series: - """ - Return a boolean mask indicating the last occurrence of each distinct value. - - .. deprecated:: 0.19.3 - This method has been renamed to :func:`Series.is_last_distinct`. - - Returns - ------- - Series - Series of data type :class:`Boolean`. - - """ - def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: - """ - Clip (limit) the values in an array to a `min` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - lower_bound - Lower bound. - - """ - def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: - """ - Clip (limit) the values in an array to a `max` boundary. - - .. deprecated:: 0.19.12 - Use :func:`clip` instead. - - Parameters - ---------- - upper_bound - Upper bound. - - """ - def shift_and_fill(self, fill_value: int | Expr) -> Series: - """ - Shift values by the given number of places and fill the resulting null values. - - .. deprecated:: 0.19.12 - Use :func:`shift` instead. - - Parameters - ---------- - fill_value - Fill None values with the result of this expression. - n - Number of places to shift (may be negative). - - """ - def is_float(self) -> bool: - ''' - Check if this Series has floating point numbers. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_float()` instead. - - Examples - -------- - >>> s = pl.Series("a", [1.0, 2.0, 3.0]) - >>> s.is_float() # doctest: +SKIP - True - - ''' - def is_integer(self, signed: bool | None = ...) -> bool: - ''' - Check if this Series datatype is an integer (signed or unsigned). - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_integer()` instead. - For signed/unsigned variants, use `Series.dtype.is_signed_integer()` - or `Series.dtype.is_unsigned_integer()`. - - Parameters - ---------- - signed - * if `None`, both signed and unsigned integer dtypes will match. - * if `True`, only signed integer dtypes will be considered a match. - * if `False`, only unsigned integer dtypes will be considered a match. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) - >>> s.is_integer() # doctest: +SKIP - True - >>> s.is_integer(signed=False) # doctest: +SKIP - True - >>> s.is_integer(signed=True) # doctest: +SKIP - False - - ''' - def is_numeric(self) -> bool: - ''' - Check if this Series datatype is numeric. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_float()` instead. - - Examples - -------- - >>> s = pl.Series("a", [1, 2, 3]) - >>> s.is_numeric() # doctest: +SKIP - True - - ''' - def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: - """ - Check if this Series datatype is temporal. - - .. deprecated:: 0.19.13 - Use `Series.dtype.is_temporal()` instead. - - Parameters - ---------- - excluding - Optionally exclude one or more temporal dtypes from matching. - - Examples - -------- - >>> from datetime import date - >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) - >>> s.is_temporal() # doctest: +SKIP - True - >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP - False - - """ - def is_boolean(self) -> bool: - ''' - Check if this Series is a Boolean. - - .. deprecated:: 0.19.14 - Use `Series.dtype == pl.Boolean` instead. - - Examples - -------- - >>> s = pl.Series("a", [True, False, True]) - >>> s.is_boolean() # doctest: +SKIP - True - - ''' - def is_utf8(self) -> bool: - ''' - Check if this Series datatype is a Utf8. - - .. deprecated:: 0.19.14 - Use `Series.dtype == pl.Utf8` instead. 
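A quick sketch of the dtype-based checks these deprecation notes point to, using `Series.dtype` in place of the deprecated helper methods:

import polars as pl

s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32)

s.dtype.is_integer()           # instead of s.is_integer()
s.dtype.is_unsigned_integer()  # instead of s.is_integer(signed=False)
s.dtype.is_float()             # instead of s.is_float()
s.dtype == pl.Boolean          # instead of s.is_boolean()
s.dtype == pl.Utf8             # instead of s.is_utf8()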
- - Examples - -------- - >>> s = pl.Series("x", ["a", "b", "c"]) - >>> s.is_utf8() # doctest: +SKIP - True - - ''' - def take_every(self, n: int) -> Series: - """ - Take every nth value in the Series and return as new Series. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather_every`. - - Parameters - ---------- - n - Gather every *n*-th row. - """ - def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: - """ - Take values by index. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`gather`. - - Parameters - ---------- - indices - Index location used for selection. - """ - def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: - """ - Set values at the index locations. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`scatter`. - - Parameters - ---------- - indices - Integers representing the index locations. - values - Replacement values. - """ - def cumsum(self) -> Series: - """ - Get an array with the cumulative sum computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_sum`. - - Parameters - ---------- - reverse - reverse the operation. - - """ - def cummax(self) -> Series: - """ - Get an array with the cumulative max computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_max`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def cummin(self) -> Series: - """ - Get an array with the cumulative min computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_min`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def cumprod(self) -> Series: - """ - Get an array with the cumulative product computed at every element. - - .. deprecated:: 0.19.14 - This method has been renamed to :meth:`cum_prod`. - - Parameters - ---------- - reverse - reverse the operation. - """ - def view(self) -> SeriesView: - """ - Get a view into this Series data with a numpy array. - - .. deprecated:: 0.19.14 - This method will be removed in a future version. - - This operation doesn't clone data, but does not include missing values. - Don't use this unless you know what you are doing. - - Parameters - ---------- - ignore_nulls - If True then nulls are converted to 0. - If False then an Exception is raised if nulls are present. - - """ - def map_dict(self, mapping: dict[Any, Any]) -> Self: - """ - Replace values in the Series using a remapping dictionary. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`replace`. The default behavior - has changed to keep any values not present in the mapping unchanged. - Pass `default=None` to keep existing behavior. - - Parameters - ---------- - mapping - Dictionary containing the before/after values to map. - default - Value to use when the remapping dict does not contain the lookup value. - Use `pl.first()`, to keep the original value. - return_dtype - Set return dtype to override automatic return dtype determination. - """ - def series_equal(self, other: Series) -> bool: - """ - Check whether the Series is equal to another Series. - - .. deprecated:: 0.19.16 - This method has been renamed to :meth:`equals`. - - Parameters - ---------- - other - Series to compare with. 
- null_equal - Consider null values as equal. - strict - Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a - `pl.Int64` will return `False`. - """ - @property - def dtype(self): ... - @property - def flags(self): ... - @property - def inner_dtype(self): ... - @property - def name(self): ... - @property - def shape(self): ... - @property - def bin(self): ... - @property - def cat(self): ... - @property - def dt(self): ... - @property - def list(self): ... - @property - def arr(self): ... - @property - def str(self): ... - @property - def struct(self): ... -def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: - """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/series/series.pyi new file mode 100644 index 0000000..504e2d8 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.3/polars/series/series.pyi @@ -0,0 +1,5265 @@ +#: version 0.20.3 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Enum as Enum, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Null as Null, Object as Object, String as String, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, hvplot as hvplot +from polars.exceptions import ModuleUpgradeRequired as ModuleUpgradeRequired, ShapeError as ShapeError +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta +from polars.utils.deprecation 
import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, _warn_null_comparison as _warn_null_comparison, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, Literal, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +_HVPLOT_AVAILABLE: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_buffer_info(self) -> BufferInfo: + """ + Return pointer, offset, and length information about the underlying buffer. + + Returns + ------- + tuple of ints + Tuple of the form (pointer, offset, length) + + Raises + ------ + ComputeError + If the `Series` contains multiple chunks. + """ + def _get_buffer(self, index: Literal[0, 1, 2]) -> Self | None: + """ + Return the underlying data, validity, or offsets buffer as a Series. + + The data buffer always exists. + The validity buffer may not exist if the column contains no null values. + The offsets buffer only exists for Series of data type `String` and `List`. + + Parameters + ---------- + index + An index indicating the buffer to return: + + - `0` -> data buffer + - `1` -> validity buffer + - `2` -> offsets buffer + + Returns + ------- + Series or None + `Series` if the specified buffer exists, `None` otherwise. + + Raises + ------ + ComputeError + If the `Series` contains multiple chunks. + """ + def _from_buffer(self, dtype: PolarsDataType, buffer_info: BufferInfo, owner: Any) -> Self: + """ + Construct a Series from information about its underlying buffer. + + Parameters + ---------- + dtype + The data type of the buffer. + buffer_info + Tuple containing buffer information in the form `(pointer, offset, length)`. + owner + The object owning the buffer. + + Returns + ------- + Series + """ + def _from_buffers(self, dtype: PolarsDataType, data: Series | Sequence[Series], validity: Series | None = ...) -> Self: + """ + Construct a Series from information about its underlying buffers. + + Parameters + ---------- + dtype + The data type of the resulting Series. + data + Buffers describing the data. For most data types, this is a single Series of + the physical data type of `dtype`. Some data types require multiple buffers: + + - `String`: A data buffer of type `UInt8` and an offsets buffer + of type `Int64`. + validity + Validity buffer. If specified, must be a Series of data type `Boolean`. + + Returns + ------- + Series + """ + def __bool__(self) -> NoReturn: ... 
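The `_from_arrow` and `_from_pandas` classmethods documented above are normally reached through public constructors; an illustrative sketch, assuming pandas and pyarrow are installed:

import pandas as pd
import polars as pl
import pyarrow as pa

# Public entry points that route through the private constructors above.
s_from_pandas = pl.Series("a", pd.Series([1, 2, 3]))
s_from_arrow = pl.from_arrow(pa.array([1, 2, 3]))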
+ def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series <= other`.""" + def lt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series < other`.""" + def eq(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series == other`.""" + def eq_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series == other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + ''' + def ne(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series != other`.""" + def ne_missing(self, other: Any) -> Self | Expr: + ''' + Method equivalent of equality operator `series != other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + + ''' + def ge(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series >= other`.""" + def gt(self, other: Any) -> Self | Expr: + """Method equivalent of operator expression `series > other`.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | None | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... 
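The comparison dunders above have the named method equivalents (`le`, `lt`, `eq`, `ne`, `ge`, `gt`) documented earlier; a minimal sketch showing that both spellings yield the same boolean Series:

import polars as pl

s = pl.Series("a", [1, 2, 3])

mask_op = s <= 2       # operator form
mask_method = s.le(2)  # documented method equivalent
assert mask_op.equals(mask_method)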
+ def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the Series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to `s[0]`, with a check + that the shape is (1,). With an index, this is equivalent to `s[index]`. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cum_sum().item(-1) + 24 + + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.sqrt() + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.414214 + 1.732051 + ] + + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. 
+ + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.cbrt() + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.259921 + 1.44225 + ] + + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + + """ + def log(self, base: float = ...) -> Series: + """ + Compute the logarithm to a given base. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.log() + shape: (3,) + Series: '' [f64] + [ + 0.0 + 0.693147 + 1.098612 + ] + """ + def log1p(self) -> Series: + """ + Compute the natural logarithm of the input array plus one, element-wise. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.log1p() + shape: (3,) + Series: '' [f64] + [ + 0.693147 + 1.098612 + 1.386294 + ] + """ + def log10(self) -> Series: + """ + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> s = pl.Series([10, 100, 1000]) + >>> s.log10() + shape: (3,) + Series: '' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + """ + def exp(self) -> Series: + """ + Compute the exponential, element-wise. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.exp() + shape: (3,) + Series: '' [f64] + [ + 2.718282 + 7.389056 + 20.085537 + ] + """ + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. 
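Since the notes above stress that null and NaN are distinct values, a small sketch of how the two drop methods behave and combine:

import polars as pl

s = pl.Series([1.0, None, float("nan"), 3.0])

s.drop_nulls()              # the NaN is kept, the null is dropped
s.drop_nans()               # the null is kept, the NaN is dropped
s.drop_nulls().drop_nans()  # chain both to remove nulls and NaNs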
+ + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a Series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + Series has a numeric dtype). All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> s = pl.Series([1, 2, 3, 4, 5]) + >>> s.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + Non-numeric data types may not have all statistics available. + + >>> s = pl.Series(["a", "a", None, "b", "c"]) + >>> s.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 4 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + + ''' + def mean(self) -> int | float | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + + ''' + def product(self) -> int | float: + """Reduce this Series to the product value.""" + def pow(self, exponent: int | float | None | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + """ + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. 
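A short illustration of the NaN-propagating aggregations described above versus the default NaN-ignoring `max`/`min` (values are made up):

import polars as pl

s = pl.Series([1.0, float("nan"), 3.0])

s.max()      # NaN ignored     -> 3.0
s.nan_max()  # NaN propagated  -> nan
s.min()      # NaN ignored     -> 1.0
s.nan_min()  # NaN propagated  -> nan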
+ + """ + def nan_min(self) -> int | float | date | datetime | timedelta | str: + """ + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy's `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + """ + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + + ''' + def median(self) -> float | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. 
+ + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. 
+ + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + + ''' + def rle(self) -> Series: + ''' + Get the lengths of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Map values to run IDs. + + Similar to RLE, but it maps each value to an ID corresponding to the run into + which it falls. This is especially useful when you want to define groups by + runs of identical values rather than the values themselves. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + include_breakpoint + Include a column that indicates the upper breakpoint. + include_category + Include a column that shows the intervals as categories. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬───────┐ + │ break_point ┆ category ┆ count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═══════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴───────┘ + + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. 
+ + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬───────┐ + │ color ┆ count │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴───────┘ + + Sort the output by count. + + >>> s.value_counts(sort=True) + shape: (3, 2) + ┌───────┬───────┐ + │ color ┆ count │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴───────┘ + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + + ''' + def cum_max(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cum_max() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + + ''' + def cum_min(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cum_min() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + + ''' + def cum_prod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_prod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + + ''' + def cum_sum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_sum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + The resulting series will consist of multiple chunks. + + Parameters + ---------- + other + Series to append. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. + + >>> a.n_chunks() + 2 + + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from `append`, which adds the chunks from `other` to the chunks of + this series, `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. 
+ + Prefer `append` over `extend` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single `Series`. In the latter case, finish the sequence + of `append` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + head + + """ + def gather_every(self, n: int, offset: int = ...) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Start the row count at this offset. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + >>> s.gather_every(2, offset=1) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. 
math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + + ''' + def null_count(self) -> int: + """ + Count the null values in this Series. + + Examples + -------- + >>> s = pl.Series([1, None, None]) + >>> s.null_count() + 2 + """ + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no `null` values. 
+
+ Notes
+ -----
+ While the *absence* of a validity bitmask guarantees that a Series does not
+ have `null` values, the converse is not true, e.g. the *presence* of a
+ bitmask does not mean that there are null values, as every value of the
+ bitmask could be `false`.
+
+ To confirm that a column has `null` values use :func:`null_count`.
+
+ """
+ def is_empty(self) -> bool:
+ '''
+ Check if the Series is empty.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [], dtype=pl.Float32)
+ >>> s.is_empty()
+ True
+
+ '''
+ def is_sorted(self) -> bool:
+ """
+ Check if the Series is sorted.
+
+ Parameters
+ ----------
+ descending
+ Check if the Series is sorted in descending order
+
+ Examples
+ --------
+ >>> s = pl.Series([1, 3, 2])
+ >>> s.is_sorted()
+ False
+
+ >>> s = pl.Series([3, 2, 1])
+ >>> s.is_sorted(descending=True)
+ True
+
+ """
+ def not_(self) -> Series:
+ '''
+ Negate a boolean Series.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [True, False, False])
+ >>> s.not_()
+ shape: (3,)
+ Series: \'a\' [bool]
+ [
+ false
+ true
+ true
+ ]
+
+ '''
+ def is_null(self) -> Series:
+ '''
+ Returns a boolean Series indicating which values are null.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1.0, 2.0, 3.0, None])
+ >>> s.is_null()
+ shape: (4,)
+ Series: \'a\' [bool]
+ [
+ false
+ false
+ false
+ true
+ ]
+
+ '''
+ def is_not_null(self) -> Series:
+ '''
+ Returns a boolean Series indicating which values are not null.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1.0, 2.0, 3.0, None])
+ >>> s.is_not_null()
+ shape: (4,)
+ Series: \'a\' [bool]
+ [
+ true
+ true
+ true
+ false
+ ]
+
+ '''
+ def is_finite(self) -> Series:
+ '''
+ Returns a boolean Series indicating which values are finite.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> s = pl.Series("a", [1.0, 2.0, np.inf])
+ >>> s.is_finite()
+ shape: (3,)
+ Series: \'a\' [bool]
+ [
+ true
+ true
+ false
+ ]
+
+ '''
+ def is_infinite(self) -> Series:
+ '''
+ Returns a boolean Series indicating which values are infinite.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> s = pl.Series("a", [1.0, 2.0, np.inf])
+ >>> s.is_infinite()
+ shape: (3,)
+ Series: \'a\' [bool]
+ [
+ false
+ false
+ true
+ ]
+
+ '''
+ def is_nan(self) -> Series:
+ '''
+ Returns a boolean Series indicating which values are NaN.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan])
+ >>> s.is_nan()
+ shape: (4,)
+ Series: \'a\' [bool]
+ [
+ false
+ false
+ false
+ true
+ ]
+
+ '''
+ def is_not_nan(self) -> Series:
+ '''
+ Returns a boolean Series indicating which values are not NaN.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan])
+ >>> s.is_not_nan()
+ shape: (4,)
+ Series: \'a\' [bool]
+ [
+ true
+ true
+ true
+ false
+ ]
+
+ '''
+ def is_in(self, other: Series | Collection[Any]) -> Series:
+ '''
+ Check if elements of this Series are in the other Series.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + + """ + def equals(self, other: Series) -> bool: + ''' + Check whether the Series is equal to another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + See Also + -------- + assert_series_equal + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s1.equals(s1) + True + >>> s1.equals(s2) + False + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). 
+ + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that fall between the given start/end values. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point `nan` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set `zero_copy_only=True`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. 
+ zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + + ''' + def _view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + + Returns + ------- + SeriesView + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s._view(ignore_nulls=True) + SeriesView([1, 0]) + + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + + ''' + def count(self) -> int: + ''' + Return the number of non-null elements in the column. + + See Also + -------- + len + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.count() + 2 + ''' + def len(self) -> int: + ''' + Return the number of elements in the Series. 
+ + Null values count towards the total. + + See Also + -------- + count + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + ''' + def set(self, filter: Series, value: int | float | str | bool | None) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def scatter(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimization (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.scatter(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_count("row_nr").select( + ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. 
+ + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + + ''' + def round_sig_figs(self, digits: int) -> Series: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s.round_sig_figs(2) + shape: (3,) + Series: '' [f64] + [ + 0.012 + 3.3 + 1200.0 + ] + + """ + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. 
+ + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. 
+ + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + + ''' + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. + + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify `fill_value` to fill the resulting null values. + + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) 
-> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. 
If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their std dev. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. 
+ + Notes + ----- + This implementation of `hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + + Examples + -------- + >>> s = pl.Series([1, -2, -3]) + >>> s.abs() + shape: (3,) + Series: '' [i64] + [ + 1 + 2 + 3 + ] + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. 
+ + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> s = pl.Series([1, 2, 2, 4, 5]) + >>> s.skew() + 0.34776706224699483 + + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. 
+ If set to `None` (default), no lower bound is applied. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no upper bound is applied. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> s = pl.Series([-50, 5, 50, None]) + >>> s.clip(1, 10) + shape: (4,) + Series: '' [i64] + [ + 1 + 5 + 10 + null + ] + + Specifying only a single bound: + + >>> s.clip(upper_bound=10) + shape: (4,) + Series: '' [i64] + [ + -50 + 5 + 10 + null + ] + + """ + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + + ''' + def replace(self, old: IntoExpr | Sequence[Any] | Mapping[Any, Any], new: IntoExpr | Sequence[Any] | NoDefault = ...) -> Self: + ''' + Replace values by different values. + + Parameters + ---------- + old + Value or sequence of values to replace. + Also accepts a mapping of values to their replacement as syntactic sugar for + `replace(new=Series(mapping.keys()), old=Series(mapping.values()))`. + new + Value or sequence of values to replace by. + Length must match the length of `old` or have length 1. + default + Set values that were not replaced to this value. + Defaults to keeping the original value. + Accepts expression input. Non-expression inputs are parsed as literals. + return_dtype + The data type of the resulting Series. If set to `None` (default), + the data type is determined automatically based on the other inputs. + + See Also + -------- + str.replace + + Notes + ----- + The global string cache must be enabled when replacing categorical values. + + Examples + -------- + Replace a single value by another value. Values that were not replaced remain + unchanged. + + >>> s = pl.Series([1, 2, 2, 3]) + >>> s.replace(2, 100) + shape: (4,) + Series: \'\' [i64] + [ + 1 + 100 + 100 + 3 + ] + + Replace multiple values by passing sequences to the `old` and `new` parameters. + + >>> s.replace([2, 3], [100, 200]) + shape: (4,) + Series: \'\' [i64] + [ + 1 + 100 + 100 + 200 + ] + + Passing a mapping with replacements is also supported as syntactic sugar. + Specify a default to set all values that were not matched. + + >>> mapping = {2: 100, 3: 200} + >>> s.replace(mapping, default=-1) + shape: (4,) + Series: \'\' [i64] + [ + -1 + 100 + 100 + 200 + ] + + + The default can be another Series. 
+ + >>> default = pl.Series([2.5, 5.0, 7.5, 10.0]) + >>> s.replace(2, 100, default=default) + shape: (4,) + Series: \'\' [f64] + [ + 2.5 + 100.0 + 100.0 + 10.0 + ] + + Replacing by values of a different data type sets the return type based on + a combination of the `new` data type and either the original data type or the + default data type if it was set. + + >>> s = pl.Series(["x", "y", "z"]) + >>> mapping = {"x": 1, "y": 2, "z": 3} + >>> s.replace(mapping) + shape: (3,) + Series: \'\' [str] + [ + "1" + "2" + "3" + ] + >>> s.replace(mapping, default=None) + shape: (3,) + Series: \'\' [i64] + [ + 1 + 2 + 3 + ] + + Set the `return_dtype` parameter to control the resulting data type directly. + + >>> s.replace(mapping, return_dtype=pl.UInt8) + shape: (3,) + Series: \'\' [u8] + [ + 1 + 2 + 3 + ] + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + """Aggregate values into a list.""" + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. 
deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + + """ + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() # doctest: +SKIP + True + + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_integer()` instead. + For signed/unsigned variants, use `Series.dtype.is_signed_integer()` + or `Series.dtype.is_unsigned_integer()`. + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. 
+ * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() # doctest: +SKIP + True + >>> s.is_integer(signed=False) # doctest: +SKIP + True + >>> s.is_integer(signed=True) # doctest: +SKIP + False + + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_numeric()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() # doctest: +SKIP + True + + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_temporal()` instead. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() # doctest: +SKIP + True + >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP + False + + """ + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Boolean` instead. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() # doctest: +SKIP + True + + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a String. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.String` instead. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() # doctest: +SKIP + True + + ''' + def take_every(self, n: int, offset: int = ...) -> Series: + """ + Take every nth value in the Series and return as new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + Index location used for selection. + """ + def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + """ + Set values at the index locations. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`scatter`. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + """ + def cumsum(self) -> Series: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + reverse the operation. + + """ + def cummax(self) -> Series: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummin(self) -> Series: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + reverse the operation. 
+ """ + def cumprod(self) -> Series: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def view(self) -> SeriesView: + """ + Get a view into this Series data with a numpy array. + + .. deprecated:: 0.19.14 + This method will be removed in a future version. + + This operation doesn't clone data, but does not include missing values. + Don't use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in the Series using a remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + def series_equal(self, other: Series) -> bool: + """ + Check whether the Series is equal to another Series. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`equals`. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... + @property + def plot(self): ... 
+def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.4/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.4/polars/dataframe/frame.pyi new file mode 100644 index 0000000..23b6274 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.4/polars/dataframe/frame.pyi @@ -0,0 +1,7074 @@ +#: version 0.20.4 +import P +import deltalake +import deltalake.table +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Enum as Enum, Float64 as Float64, Null as Null, Object as Object, String as String, Unknown as Unknown +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, hvplot as hvplot +from polars.exceptions import ModuleUpgradeRequired as ModuleUpgradeRequired, NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, frame_to_pydf as frame_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_parameter_as_positional as deprecate_parameter_as_positional, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as 
deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_index_args as _prepare_row_index_args, _process_null_values as _process_null_values, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes, warn_null_comparison as warn_null_comparison +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, IO, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_HVPLOT_AVAILABLE: bool +_PANDAS_AVAILABLE: bool +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
+ schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. 
+ schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + """ + @classmethod + def _read_csv(cls, source: str | Path | IO[bytes] | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use `pl.read_csv` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + """ + @classmethod + def _read_parquet(cls, source: str | Path | IO[bytes] | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use `pl.read_parquet` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading `n_rows`. + """ + @classmethod + def _read_ipc(cls, source: str | Path | IO[bytes] | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading `n_rows`. + row_index_name + Row index name. + row_index_offset + Row index offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | IO[bytes] | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading `n_rows`. + row_index_name + Row index name. 
+ row_index_offset + Row index offset. + rechunk + Make sure that all data is contiguous. + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use `pl.read_json` to dispatch to this method. + + See Also + -------- + polars.io.read_json + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use `pl.read_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with `NaN`. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to `True` will raise a `NotImplementedError`. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars DataFrame to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (, 64, \'g\', \'=\') + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... 
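# --- Illustrative sketch (editorial, not part of the generated stub) -------------------------
# Hedged example of the interchange-protocol and comparison dunders stubbed above: per the
# annotations, comparisons return a DataFrame rather than a plain bool, and __dataframe__
# returns an interchange-protocol object. The calls mirror the __dataframe__ doctest.
import polars as pl

df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]})
dfi = df.__dataframe__()  # dataframe interchange protocol object
dfi.num_rows()            # -> 2

df == df  # __eq__ is annotated -> DataFrame: an element-wise comparison result, not a bool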
+ def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the DataFrame as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to `df[0,0]`, with a check that + the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + ''' + def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are Series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + `structured` is set to `False` and the DataFrame dtypes allow for a + global dtype for all columns. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + function for the conversion to numpy if necessary. + + Notes + ----- + If you\'re attempting to convert String or Decimal to an array, you\'ll need to + install `pyarrow`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. 
, \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.String), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.String), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. 
This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path or writeable file-like object to which the data will be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + name + Schema name. Defaults to empty string. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open `xlsxwriter.Workbook` object that has not been closed. + If None, writes to a `dataframe.xlsx` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of `{"key":value,}` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. + column_formats : dict + A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. + dtype_formats : dict + A `{dtype:str,}` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + `column_formats` param). It is also valid to use dtype groups such as + `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid `xlsxwriter` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all `xlsxwriter` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. 
+ header_format : dict + A `{key:value,}` dictionary of `xlsxwriter` format options to apply + to the table header row, such as `{"bold":True, "font_color":"#702963"}`. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a `{colname:funcname,}` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A `{colname:int,}` or `{selector:int,}` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a `{colname:columns,}` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or `{row_index:int,}` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that `row_index` starts at zero and will be + the header row (unless `include_header` is False). + sparklines : dict + A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an `xlsxwriter`-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. 
+ float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + include_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible `xlsxwriter` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic DataFrame: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... 
) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... 
# create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC data will be + written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC record batch data will + be written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. 
+ compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + data_page_size + Size of the data page in bytes. Defaults to 1024^2 bytes. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to `pyarrow.parquet.write_table`. + + If you pass `partition_cols` here, the dataset will be written + using `pyarrow.parquet.write_to_dataset`. + The `partition_cols` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + ''' + def write_database(self, table_name: str, connection: str) -> int: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Schema-qualified name of the table to create or append to in the target + SQL database. If your table name contains special characters, it should + be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_table_exists : {\'append\', \'replace\', \'fail\'} + The insert mode: + + * \'replace\' will create a new database table, overwriting an existing one. + * \'append\' will append to an existing table. + * \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine to use for writing frame data. + + Returns + ------- + int + The number of rows affected, if the driver provides this information. + Otherwise, returns -1. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> deltalake.table.TableMerger | None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\', \'merge\'} + How to handle existing data. + + - If \'error\', throw an error if the table already exists (default). + - If \'append\', will add new data. + - If \'overwrite\', will replace table with new data. + - If \'ignore\', will not write anything if table already exists. + - If \'merge\', return a `TableMerger` object to merge data from the DataFrame + with the existing data. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. 
+ + - See a list of supported storage options for S3 `here `__. + - See a list of supported storage options for GCS `here `__. + - See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + delta_merge_options + Keyword arguments which are required to `MERGE` a Delta lake Table. + See a list of supported merge options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + TableNotFoundError + If the delta table doesn\'t exist and MERGE action is triggered + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. + + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a DataFrame as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + Merge the DataFrame with an existing Delta Lake table. + For all `TableMerger` methods, check the deltalake docs + `here `__. + + Schema evolution is not yet supported in by the `deltalake` package, therefore + `overwrite_schema` will not have any effect on a merge operation. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> ( + ... df.write_delta( + ... "table_path", + ... mode="merge", + ... delta_merge_options={ + ... "predicate": "s.foo = t.foo", + ... "source_alias": "s", + ... "target_alias": "t", + ... }, + ... ) + ... .when_matched_update_all() + ... .when_not_matched_insert_all() + ... .execute() + ... 
) # doctest: +SKIP + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.String)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 4 ┆ 5 ┆ 6 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["x", "y", "z"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["x", "y", "z"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... 
count += 1 + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 4 ┆ 5 ┆ 6 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["i", "j", "k"], a=[1, 2, 3], b=[4, 5, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ i ┆ j ┆ k │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ i ┆ j ┆ k │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 4 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def insert_column(self, index: int, column: Series) -> Self: + ''' + Insert a Series at a certain column index. + + This operation is in place. + + Parameters + ---------- + index + Index at which to insert the new `Series` column. + column + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_column(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_column(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on one or more predicate expressions. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression(s) that evaluates to a boolean Series. 
+ constraints + Column filters; use `name = value` to filter columns by the supplied value. + Each constraint will behave the same as `pl.col(name).eq(value)`, and + will be implicitly joined with the other filter conditions using `&`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... ) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Warnings + -------- + We will never guarantee the output of describe to be stable. + It will show statistics that we deem informative and may + be updated in the future. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... 
"float": [1.0, 2.8, 3.0], + ... "int": [4, 5, None], + ... "bool": [True, False, True], + ... "str": [None, "b", "c"], + ... "str2": ["usd", "eur", None], + ... "date": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬───────┬──────┬──────┬────────────┐ + │ describe ┆ float ┆ int ┆ bool ┆ str ┆ str2 ┆ date │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ str ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪═══════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3 ┆ 2 ┆ 2 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ False ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 2.8 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ True ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴───────┴──────┴──────┴────────────┘ + ''' + def get_column_index(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.get_column_index("ham") + 2 + ''' + def replace_column(self, index: int, column: Series) -> Self: + ''' + Replace a column at an index location. + + This operation is in place. + + Parameters + ---------- + index + Column index. + column + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_column(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If `descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the `k` smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If `descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the `k` largest. Bottom-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + ''' + def equals(self, other: DataFrame) -> bool: + ''' + Check whether the DataFrame is equal to another DataFrame. + + Parameters + ---------- + other + DataFrame to compare with. 
+ null_equal + Consider null values as equal. + + See Also + -------- + assert_frame_equal + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.equals(df1) + True + >>> df1.equals(df2) + False + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. 
+ + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + head + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... 
return data.with_columns(pl.col(col_name).cast(pl.Int64)) + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + ''' + def with_row_index(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a row index as the first column in the DataFrame. + + Parameters + ---------- + name + Name of the index column. + offset + Start the index at this offset. Cannot be negative. + + Notes + ----- + The resulting column does not have any special properties. It is a regular + column of type `UInt32` (or `UInt64` in `polars-u64-idx`). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_index() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + >>> df.with_row_index("id", offset=1000) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ id ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1000 ┆ 1 ┆ 2 │ + │ 1001 ┆ 3 ┆ 4 │ + │ 1002 ┆ 5 ┆ 6 │ + └──────┴─────┴─────┘ + + An index column can also be created using the expressions :func:`int_range` + and :func:`count`. + + >>> df.select( + ... pl.int_range(pl.count(), dtype=pl.UInt32).alias("index"), + ... pl.all(), + ... ) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + .. deprecated:: + Use `meth`:with_row_index` instead. + Note that the default column name has changed from \'row_nr\' to \'index\'. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() # doctest: +SKIP + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. 
+ + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The `GroupBy` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... 
+ * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. 
+ * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + Time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + Interval will start \'every\' duration. + offset + Change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group. + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... 
"values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\', \'outer_coalesce\'} + Join strategy. + + * *inner* + Returns rows that have matching values in both tables + * *left* + Returns all rows from the left table, and the matched rows from the + right table + * *outer* + Returns all rows when there is a match in either left or right table + * *outer_coalesce* + Same as \'outer\', but coalesces the key columns + * *cross* + Returns the cartisian product of rows from both tables + * *semi* + Filter rows that have a match in the right table. + * *anti* + Filter rows that not have a match in the right table. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + join_nulls + Join on null values. By default null values will never produce matches. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 5) + ┌──────┬──────┬──────┬───────┬───────────┐ + │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞══════╪══════╪══════╪═══════╪═══════════╡ + │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │ + │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │ + │ null ┆ null ┆ null ┆ z ┆ d │ + │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │ + └──────┴──────┴──────┴───────┴───────────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see `pl.StringCache()`. + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: `udf(row)`. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level `apply` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level `apply` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of + this `DataFrame`, `extend` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer `vstack` over `extend` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single `DataFrame`. In the latter case, finish the sequence of + `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + ''' + def drop(self, *columns: ColumnNameOrSelector | Iterable[ColumnNameOrSelector]) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + *columns + Names of the columns that should be removed from the dataframe. + Accepts column selector input. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.String).to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.String}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill `value`. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or String datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'first\', \'sum\', \'max\', \'min\', \'mean\', \'median\', \'last\', \'count\'} + - An expression to do the aggregation. + + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to `None` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. 
+ + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are tuples of + the distinct group values that identify each group. If a single string + was passed to `by`, the keys are a single value instead of a tuple. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying `as_dict=True`. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {(\'a\',): shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + (\'b\',): shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + (\'c\',): shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. 
+ + >>> df.shift(-2, fill_value=100) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this DataFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + See Also + -------- + with_columns + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + ''' + def max(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`max_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + ''' + def max_horizontal(self) -> Series: + ''' + Get the maximum value horizontally across columns. + + Returns + ------- + Series + A Series named `"max"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.max_horizontal() + shape: (3,) + Series: \'max\' [f64] + [ + 4.0 + 5.0 + 6.0 + ] + ''' + def min(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`min_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + ''' + def min_horizontal(self) -> Series: + ''' + Get the minimum value horizontally across columns. + + Returns + ------- + Series + A Series named `"min"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.min_horizontal() + shape: (3,) + Series: \'min\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`sum_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + ''' + def sum_horizontal(self) -> Series: + ''' + Sum all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. 
+ + Returns + ------- + Series + A Series named `"sum"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.sum_horizontal() + shape: (3,) + Series: \'sum\' [f64] + [ + 5.0 + 7.0 + 9.0 + ] + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`mean_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + ''' + def mean_horizontal(self) -> Series: + ''' + Take the mean of all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"mean"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.mean_horizontal() + shape: (3,) + Series: \'mean\' [f64] + [ + 2.5 + 3.5 + 4.5 + ] + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to `None` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the `DataFrame` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. 
+ + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + - Int8 + String = String + - Float32 + Int64 = Float32 + - Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The `index` and `by_predicate` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using `by_predicate` it is an error condition if anything other than + one row is returned; more than one row raises `TooManyRowsReturnedError`, and + zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of `iter_rows()` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify `named=True` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use `by_predicate` to return the row that matches the given predicate. 
+ + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using `iter_rows` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materializing all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialize all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialize all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + ''' + def iter_columns(self) -> Iterator[Series]: + ''' + Returns an iterator over the DataFrame\'s columns. 
+ + Notes + ----- + Consider whether you can use :func:`all` instead. + If you can, it will be more efficient. + + Returns + ------- + Iterator of Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [s.name for s in df.iter_columns()] + [\'a\', \'b\'] + + If you\'re using this to modify a dataframe\'s columns, e.g. + + >>> # Do NOT do this + >>> pl.DataFrame(column * 2 for column in df.iter_columns()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + + then consider whether you can use :func:`all` instead: + + >>> df.select(pl.all() * 2) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + """ + def gather_every(self, n: int, offset: int = ...) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.gather_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + >>> s.gather_every(2, offset=1) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash_rows` does not guarantee stable results + across different Polars versions. 
Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + ''' + def to_struct(self, name: str = ...) -> Series: + ''' + Convert a `DataFrame` to a `Series` of type `Struct`. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy `corrcoef` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. 
+ + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy `corrcoef`. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the values in `other`. + + .. warning:: + This functionality is experimental and may change without it being + considered a breaking change. + + By default, null values in the right frame are ignored. Use + `include_nulls=False` to overwrite values in this frame with + null values in the other frame. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. If set to `None` (default), + the implicit row index of each frame is used as a join key. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. 
+ + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce + when `include_nulls = False` + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + ''' + def count(self) -> DataFrame: + ''' + Return the number of non-null elements for each column. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"a": [1, 2, 3, 4], "b": [1, 2, 1, None], "c": [None, None, None, None]} + ... ) + >>> df.count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 3 ┆ 0 │ + └─────┴─────┴─────┘ + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. 
+ This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + """ + def take_every(self, n: int, offset: int = ...) -> DataFrame: + """ + Take every nth row in the DataFrame and return as a new DataFrame. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + def find_idx_by_name(self, name: str) -> int: + """ + Find the index of a column by name. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`get_column_index`. + + Parameters + ---------- + name + Name of the column to find. + """ + def insert_at_idx(self, index: int, column: Series) -> Self: + """ + Insert a Series at a certain column index. This operation is in place. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`insert_column`. + + Parameters + ---------- + index + Column to insert the new `Series` column. + column + `Series` to insert. + """ + def replace_at_idx(self, index: int, new_column: Series) -> Self: + """ + Replace a column at an index location. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`replace_column`. + + Parameters + ---------- + index + Column index. + new_column + Series that will replace the column. 
+ """ + def frame_equal(self, other: DataFrame) -> bool: + """ + Check whether the DataFrame is equal to another DataFrame. + + .. deprecated:: 0.19.16 + This method has been renamed to :func:`equals`. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + """ + @property + def plot(self): ... + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.4/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.4/polars/expr/expr.pyi new file mode 100644 index 0000000..b0c2b0c --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.4/polars/expr/expr.pyi @@ -0,0 +1,8338 @@ +#: version 0.20.4 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Int64 as Int64 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions, parse_predicates_constraints_as_expression as parse_predicates_constraints_as_expression +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import no_default as no_default, sphinx_accessor as sphinx_accessor, warn_null_comparison as warn_null_comparison +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + class _map_batches_wrapper: + def __init__(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None) -> None: ... + def __call__(self, *args: Any, **kwargs: Any) -> Any: ... + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... 
+ @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: IntoExpr) -> Self: ... + def __radd__(self, other: IntoExpr) -> Self: ... + def __and__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __rand__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __eq__(self, other: IntoExpr) -> Self: ... + def __floordiv__(self, other: IntoExpr) -> Self: ... + def __rfloordiv__(self, other: IntoExpr) -> Self: ... + def __ge__(self, other: IntoExpr) -> Self: ... + def __gt__(self, other: IntoExpr) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: IntoExpr) -> Self: ... + def __lt__(self, other: IntoExpr) -> Self: ... + def __mod__(self, other: IntoExpr) -> Self: ... + def __rmod__(self, other: IntoExpr) -> Self: ... + def __mul__(self, other: IntoExpr) -> Self: ... + def __rmul__(self, other: IntoExpr) -> Self: ... + def __ne__(self, other: IntoExpr) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __ror__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, exponent: IntoExprColumn | int | float) -> Self: ... + def __rpow__(self, base: IntoExprColumn | int | float) -> Expr: ... + def __sub__(self, other: IntoExpr) -> Self: ... + def __rsub__(self, other: IntoExpr) -> Self: ... + def __truediv__(self, other: IntoExpr) -> Self: ... + def __rtruediv__(self, other: IntoExpr) -> Self: ... + def __xor__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __rxor__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). 
+ + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... ) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.map`. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + keep_name + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.prefix`. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. 
+ + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.suffix`. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.keep`. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with `^` and end with `$`. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... 
) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + ''' + def count(self) -> Self: + ''' + Return the number of non-null elements in the column. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + len + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 2 │ + └─────┴─────┘ + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values count towards the total. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + count + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + ''' + def cum_sum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_sum().alias("cum_sum"), + ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_sum ┆ cum_sum_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 10 │ + │ 2 ┆ 3 ┆ 9 │ + │ 3 ┆ 6 ┆ 7 │ + │ 4 ┆ 10 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_sum().alias("value_cum_sum"), + ... pl.col("values") + ... .cum_sum() + ... .forward_fill() + ... .alias("value_cum_sum_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬───────────────┬──────────────────────────┐ + │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═══════════════╪══════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴───────────────┴──────────────────────────┘ + ''' + def cum_prod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_prod().alias("cum_prod"), + ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), + ... 
) + shape: (4, 3) + ┌─────┬──────────┬──────────────────┐ + │ a ┆ cum_prod ┆ cum_prod_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════╪══════════════════╡ + │ 1 ┆ 1 ┆ 24 │ + │ 2 ┆ 2 ┆ 24 │ + │ 3 ┆ 6 ┆ 12 │ + │ 4 ┆ 24 ┆ 4 │ + └─────┴──────────┴──────────────────┘ + ''' + def cum_min(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_min().alias("cum_min"), + ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_min ┆ cum_min_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 1 ┆ 3 │ + │ 4 ┆ 1 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + ''' + def cum_max(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_max().alias("cum_max"), + ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_max ┆ cum_max_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 2 ┆ 4 │ + │ 3 ┆ 3 ┆ 4 │ + │ 4 ┆ 4 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_max().alias("cum_max"), + ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬─────────┬────────────────────┐ + │ values ┆ cum_max ┆ cum_max_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════════╪════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴─────────┴────────────────────┘ + ''' + def cum_count(self) -> Self: + ''' + Return the cumulative count of the non-null values in the column. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": ["x", "k", None, "d"]}) + >>> df.with_columns( + ... pl.col("a").cum_count().alias("cum_count"), + ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), + ... ) + shape: (4, 3) + ┌──────┬───────────┬───────────────────┐ + │ a ┆ cum_count ┆ cum_count_reverse │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u32 ┆ u32 │ + ╞══════╪═══════════╪═══════════════════╡ + │ x ┆ 1 ┆ 3 │ + │ k ┆ 2 ┆ 2 │ + │ null ┆ 2 ┆ 1 │ + │ d ┆ 3 ┆ 1 │ + └──────┴───────────┴───────────────────┘ + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + ''' + def round_sig_figs(self, digits: int) -> Self: + ''' + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) + >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) + shape: (3, 2) + ┌─────────┬────────────────┐ + │ a ┆ round_sig_figs │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════════╪════════════════╡ + │ 0.01234 ┆ 0.012 │ + │ 3.333 ┆ 3.3 │ + │ 1234.0 ┆ 1200.0 │ + └─────────┴────────────────┘ + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... 
) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... 
) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + See Also + -------- + Expr.get : Take a single value + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg( + ... pl.col("value").gather([2, 1]) + ... ) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + ''' + def get(self, index: int | Expr) -> Self: + ''' + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. 
+ + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... 
"fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 2 ┆ 0 │ + └─────┴─────┘ + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + ''' + def last(self) -> Self: + ''' + Get the last value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... 
+ * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... ) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. 
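A common companion idiom (an assumption, separate from the stub's example that follows): keep only the first occurrence of each value by filtering on this mask.

>>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]})
>>> df.filter(pl.col("a").is_first_distinct())  # doctest: +IGNORE_RESULT

This keeps the three rows where `a` first takes the values 1, 2 and 3.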
+ + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. 
+ include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... 
).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + ''' + def rle(self) -> Self: + ''' + Get the lengths and values of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + See Also + -------- + rle_id + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Get a distinct integer ID for each run of identical values. + + The ID increases by one each time the value of a column (which can be a + :class:`Struct`) changes. + + This is especially useful when you want to define a new group for every time a + column\'s value changes, rather than for every distinct value of that column. + + See Also + -------- + rle + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn], **constraints: Any) -> Self: + ''' + Filter the expression based on one or more predicate expressions. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicates + Expression(s) that evaluates to a boolean Series. + constraints + Column filters; use `name = value` to filter columns by the supplied value. + Each constraint will behave the same as `pl.col(name).eq(value)`, and + will be implicitly joined with the other filter conditions using `&`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), + ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + Filter expressions can also take constraints as keyword arguments. + + >>> import polars.selectors as cs + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "a", "a", "a", "b", "b", "b", "b", "b"], + ... "n": [1, 2, 2, 3, 1, 3, 3, 2, 3], + ... }, + ... ) + >>> df.group_by("key").agg( + ... n_1=pl.col("n").filter(n=1).sum(), + ... n_2=pl.col("n").filter(n=2).sum(), + ... n_3=pl.col("n").filter(n=3).sum(), + ... 
).sort(by="key") + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ key ┆ n_1 ┆ n_2 ┆ n_3 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 4 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 9 │ + └─────┴─────┴─────┴─────┘ + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + .. deprecated:: 0.20.4 + Use :func:`filter` instead. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( # doctest: +SKIP + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series (or a NumPy array, in which + case it will be automatically converted into a Series). If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for `map` functions is transforming the values + represented by an expression using a third-party library. + + .. warning:: + If you are looking to map a function over a window function or group_by + context, refer to :func:`map_elements` instead. + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + is_elementwise + If set to true this can run in the streaming engine, but may yield + incorrect results in group-by. Ensure you know what you are doing! + agg_list + Aggregate the values of the expression into a list before applying the + function. This parameter only works in a group-by context. + The function will be invoked only once on a list of groups, rather than + once per group. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_elements + replace + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + In a group-by context, the `agg_list` parameter can improve performance if used + correctly. The following example has `agg_list` set to `False`, which causes + the function to be applied once per group. The input of the function is a + Series of type `Int64`. This is less efficient. + + >>> df = pl.DataFrame( + ... { + ... "a": [0, 1, 0, 1], + ... "b": [1, 2, 3, 4], + ... } + ... ) + >>> df.group_by("a").agg( + ... pl.col("b").map_batches(lambda x: x.max(), agg_list=False) + ... 
) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ list[i64] │ + ╞═════╪═══════════╡ + │ 1 ┆ [4] │ + │ 0 ┆ [3] │ + └─────┴───────────┘ + + Using `agg_list=True` would be more efficient. In this example, the input of + the function is a Series of type `List(Int64)`. + + >>> df.group_by("a").agg( + ... pl.col("b").map_batches(lambda x: x.list.max(), agg_list=True) + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴─────┘ + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type `Callable[[Any], Any]`. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type `Callable[[Series], Any]`. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be `pl.Unknown`. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). + pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using `map_elements` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using `over` is considered a GroupBy context + here, so `map_elements` can be used to map functions over window groups. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using `over` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... ).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort("key") # doctest: +IGNORE_RESULT + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... 
"b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + ''' + def gather_every(self, n: int, offset: int = ...) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").gather_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + >>> df.select(pl.col("foo").gather_every(3, offset=1)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 5 │ + │ 8 │ + └─────┘ + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator `expr & other & ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator `expr | other | ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... 
) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other` where `None == None`. + + This differs from default `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator `expr >= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ true │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator `expr > other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator `expr <= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... 
"x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator `expr < other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator `expr != other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr != other` where `None == None`. + + This differs from default `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator `expr + other`. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... 
{"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator `expr // other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator `expr % other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator `expr * other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator `expr - other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator `expr / other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... 
) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + ''' + def pow(self, exponent: IntoExprColumn | int | float) -> Self: + ''' + Method equivalent of exponentiation operator `expr ** exponent`. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator `expr ^ other`. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... 
) + >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) + shape: (3, 3) + ┌───────────┬──────────────────┬──────────┐ + │ sets ┆ optional_members ┆ contains │ + │ --- ┆ --- ┆ --- │ + │ list[i64] ┆ i64 ┆ bool │ + ╞═══════════╪══════════════════╪══════════╡ + │ [1, 2, 3] ┆ 1 ┆ true │ + │ [1, 2] ┆ 2 ┆ true │ + │ [9, 10] ┆ 3 ┆ false │ + └───────────┴──────────────────┴──────────┘ + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given lower and upper bounds. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with `lit` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. 
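A hedged usage sketch (not part of the stub): since the result is UInt64, it can be reduced modulo a small integer to derive bucket ids for sampling or sharding; the stability caveat in the Notes below still applies.

>>> df = pl.DataFrame({"user": ["a", "b", "c", "d"]})
>>> df.with_columns(bucket=pl.col("user").hash(seed=42) % 4)  # doctest: +IGNORE_RESULT

The `bucket` values lie in 0..3 but are only stable within a single Polars version.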
+ + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... 
).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ … ┆ … │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... 
) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("index").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └───────┴─────────────────────┴─────────────────┘ + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. 
Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("index").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("index").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └───────┴─────────────────────┴─────────────────┘ + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). 
Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("index").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬──────────────────┐ + │ index ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └───────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("index").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬──────────────────┐ + │ index ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └───────┴─────────────────────┴──────────────────┘ + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). 
Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... 
rolling_row_sum=pl.col("index").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("index").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └───────┴─────────────────────┴─────────────────┘ + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. 
warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("index").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("index").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └───────┴─────────────────────┴─────────────────┘ + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. 
Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("index").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("index").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └───────┴─────────────────────┴─────────────────┘ + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. 
+ weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. 
+ bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. 
+ + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. 
math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: + ''' + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. 
+ If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. 
+ + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + ''' + def hist(self, bins: IntoExpr | None = ...) -> Self: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + include_breakpoint + Include a column that indicates the upper breakpoint. + include_category + Include a column that shows the intervals as categories. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 3, 8, 8, 2, 1, 3]}) + >>> df.select(pl.col("a").hist(bins=[1, 2, 3])) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 2 │ + │ 2 │ + └─────┘ + >>> df.select( + ... pl.col("a").hist( + ... bins=[1, 2, 3], include_breakpoint=True, include_category=True + ... ) + ... ) + shape: (4, 1) + ┌───────────────────────┐ + │ a │ + │ --- │ + │ struct[3] │ + ╞═══════════════════════╡ + │ {1.0,"(-inf, 1.0]",2} │ + │ {2.0,"(1.0, 2.0]",1} │ + │ {3.0,"(2.0, 3.0]",2} │ + │ {inf,"(3.0, inf]",2} │ + └───────────────────────┘ + ''' + def replace(self, old: IntoExpr | Sequence[Any] | Mapping[Any, Any], new: IntoExpr | Sequence[Any] | NoDefault = ...) -> Self: + ''' + Replace values by different values. + + Parameters + ---------- + old + Value or sequence of values to replace. + Accepts expression input. Sequences are parsed as Series, + other non-expression inputs are parsed as literals. + Also accepts a mapping of values to their replacement as syntactic sugar for + `replace(new=Series(mapping.keys()), old=Series(mapping.values()))`. + new + Value or sequence of values to replace by. + Accepts expression input. Sequences are parsed as Series, + other non-expression inputs are parsed as literals. + Length must match the length of `old` or have length 1. + default + Set values that were not replaced to this value. + Defaults to keeping the original value. + Accepts expression input. Non-expression inputs are parsed as literals. + return_dtype + The data type of the resulting expression. 
If set to `None` (default), + the data type is determined automatically based on the other inputs. + + See Also + -------- + str.replace + + Notes + ----- + The global string cache must be enabled when replacing categorical values. + + Examples + -------- + Replace a single value by another value. Values that were not replaced remain + unchanged. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) + >>> df.with_columns(replaced=pl.col("a").replace(2, 100)) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 3 │ + └─────┴──────────┘ + + Replace multiple values by passing sequences to the `old` and `new` parameters. + + >>> df.with_columns(replaced=pl.col("a").replace([2, 3], [100, 200])) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 200 │ + └─────┴──────────┘ + + Passing a mapping with replacements is also supported as syntactic sugar. + Specify a default to set all values that were not matched. + + >>> mapping = {2: 100, 3: 200} + >>> df.with_columns(replaced=pl.col("a").replace(mapping, default=-1)) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ -1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 200 │ + └─────┴──────────┘ + + Replacing by values of a different data type sets the return type based on + a combination of the `new` data type and either the original data type or the + default data type if it was set. + + >>> df = pl.DataFrame({"a": ["x", "y", "z"]}) + >>> mapping = {"x": 1, "y": 2, "z": 3} + >>> df.with_columns(replaced=pl.col("a").replace(mapping)) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + >>> df.with_columns(replaced=pl.col("a").replace(mapping, default=None)) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + + Set the `return_dtype` parameter to control the resulting data type directly. + + >>> df.with_columns( + ... replaced=pl.col("a").replace(mapping, return_dtype=pl.UInt8) + ... ) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ u8 │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + + Expression input is supported for all parameters. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3], "b": [1.5, 2.5, 5.0, 1.0]}) + >>> df.with_columns( + ... replaced=pl.col("a").replace( + ... old=pl.col("a").max(), + ... new=pl.col("b").sum(), + ... default=pl.col("b"), + ... ) + ... ) + shape: (4, 3) + ┌─────┬─────┬──────────┐ + │ a ┆ b ┆ replaced │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═════╪══════════╡ + │ 1 ┆ 1.5 ┆ 1.5 │ + │ 2 ┆ 2.5 ┆ 2.5 │ + │ 2 ┆ 5.0 ┆ 5.0 │ + │ 3 ┆ 1.0 ┆ 10.0 │ + └─────┴─────┴──────────┘ + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. 
+ agg_list + Aggregate list + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + `polars.Unknown`. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). 
+ """ + def register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by `lib::symbol`. + + The parameters you give dictate how polars will deal + with the function. Make sure they are correct! + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. + returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like `sum`, `min`, `covariance` etc. + cast_to_supertypes + Cast the input datatypes to their supertype. + pass_name_to_apply + if set, then the `Series` passed to the function in the group_by operation + will ensure the name is set. This is an extra heap allocation per group. + changes_length + For example a `unique` or a `slice` + """ + def _register_plugin(self) -> Self: ... + def take_every(self, n: int, offset: int = ...) -> Self: + """ + Take every nth value in the Series and return as a new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + """ + def cumsum(self) -> Self: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumprod(self) -> Self: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummin(self) -> Self: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummax(self) -> Self: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumcount(self) -> Self: + """ + Get an array with the cumulative count computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_count`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in column according to remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. 
+ + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.4/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.4/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..45d2353 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.4/polars/lazyframe/frame.pyi @@ -0,0 +1,4211 @@ +#: version 0.20.4 +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, String as String, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.lazyframe.in_process import InProcessQuery as InProcessQuery +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_parameter_as_positional as deprecate_parameter_as_positional, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import 
_in_notebook as _in_notebook, _prepare_row_index_args as _prepare_row_index_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use `pl.scan_csv` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use `pl.scan_parquet` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use `pl.scan_ipc` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use `pl.scan_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) 
-> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to `True`. + If this is set to `True` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... 
).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and pass on + the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the LazyFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If `descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the `k` smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If `descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the `k` largest. Bottom-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. 
+ + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + ''' + def collect(self) -> DataFrame | InProcessQuery: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to `False`. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + This functionality is currently in an alpha state. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + background + Run the query in the background and get a handle to the query. + This handle can be used to fetch the result or cancel the query. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + DataFrame directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. 
+ compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). 
+ date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + ''' + def sink_ndjson(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an NDJSON file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ndjson("out.ndjson") # doctest: +SKIP + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. 
+ comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.String).collect().to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.String}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + ''' + def clear(self, n: int = ...) 
-> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters; use `name = value` to filter columns by the supplied value. + Each constraint will behave the same as `pl.col(name).eq(value)`, and + will be implicitly joined with the other filter conditions using `&`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... 
).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. 
+ + See Also + -------- + select + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to `True` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + ''' + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `dynamic_group_by` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). 
Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... ) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. 
+ This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. 
+ + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. 
+ + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. 
+ + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. 
+ how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\', \'outer_coalesce\'} + Join strategy. + + * *inner* + Returns rows that have matching values in both tables + * *left* + Returns all rows from the left table, and the matched rows from the + right table + * *outer* + Returns all rows when there is a match in either left or right table + * *outer_coalesce* + Same as \'outer\', but coalesces the key columns + * *cross* + Returns the cartesian product of rows from both tables + * *semi* + Filter rows that have a match in the right table. + * *anti* + Filter rows that do not have a match in the right table. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported by the streaming engine. + - This is only supported when joined by single columns. + join_nulls + Join on null values. By default null values will never produce matches. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 5) + ┌──────┬──────┬──────┬───────┬───────────┐ + │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞══════╪══════╪══════╪═══════╪═══════════╡ + │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │ + │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │ + │ null ┆ null ┆ null ┆ z ┆ d │ + │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │ + └──────┴──────┴──────┴───────┴───────────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this LazyFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this LazyFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... 
pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another DataFrame: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + ''' + def drop(self, *columns: ColumnNameOrSelector | Iterable[ColumnNameOrSelector]) -> Self: + ''' + Remove columns from the DataFrame. + + Parameters + ---------- + *columns + Names of the columns that should be removed from the dataframe. + Accepts column selector input. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), + polars will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. 
+ + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... 
{ + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + ''' + def with_row_index(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a row index as the first column in the LazyFrame. + + Parameters + ---------- + name + Name of the index column. + offset + Start the index at this offset. Cannot be negative. + + Warnings + -------- + Using this function can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Notes + ----- + The resulting column does not have any special properties. It is a regular + column of type `UInt32` (or `UInt64` in `polars-u64-idx`). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_index().collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + >>> lf.with_row_index("id", offset=1000).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ id ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1000 ┆ 1 ┆ 2 │ + │ 1001 ┆ 3 ┆ 4 │ + │ 1002 ┆ 5 ┆ 6 │ + └──────┴─────┴─────┘ + + An index column can also be created using the expressions :func:`int_range` + and :func:`count`. + + >>> lf.select( + ... pl.int_range(pl.count(), dtype=pl.UInt32).alias("index"), + ... pl.all(), + ... ).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + .. deprecated:: + Use `meth`:with_row_index` instead. + Note that the default column name has changed from \'row_nr\' to \'index\'. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. 
+ + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() # doctest: +SKIP + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + ''' + def gather_every(self, n: int, offset: int = ...) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.gather_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + >>> lf.gather_every(2, offset=1).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill `value` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + ''' + def std(self, ddof: int = ...) 
-> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the DataFrame to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or String datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this DataFrame. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. 
+ + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to run with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is executed on the full + dataset. + + Warnings + -------- + The `schema` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, `predicate_pushdown` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ...
) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the DataFrame at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... 
).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + .. warning:: + This functionality is experimental and may change without it being + considered a breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. If set to `None` (default), + the implicit row index of each frame is used as a join key. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + include_nulls + If True, null values from the right DataFrame will be used to update the + left DataFrame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... 
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + ''' + def count(self) -> Self: + ''' + Return the number of non-null elements for each column. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... {"a": [1, 2, 3, 4], "b": [1, 2, 1, None], "c": [None, None, None, None]} + ... ) + >>> lf.count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 3 ┆ 0 │ + └─────┴─────┴─────┘ + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to run with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is executed on the full + dataset. + """ + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + n + Number of places to shift (may be negative). + """ + def take_every(self, n: int, offset: int = ...) -> Self: + """ + Take every nth row in the LazyFrame and return as a new LazyFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ...
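The deprecation notices in the LazyFrame stub above each point at a renamed method. A minimal sketch of that mapping, assuming polars >= 0.19.12 (the data and lambdas below are only illustrative and are not part of the generated stub):

import polars as pl

lf = pl.LazyFrame({"a": [1, 2, 3, 4], "b": [1, 2, 1, None]})

# Each call builds a lazy plan; the comment names the deprecated alias it replaces.
lf.gather_every(2)                       # was: lf.take_every(2)
lf.group_by("a").agg(pl.col("b").sum())  # was: lf.groupby("a").agg(...)
lf.map_batches(lambda df: df * 2)        # was: lf.map(lambda df: df * 2)
lf.shift(1, fill_value=0)                # was: lf.shift_and_fill(0, n=1)

# Nothing executes until .collect(), e.g.:
lf.gather_every(2).collect()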
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.4/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.4/polars/series/series.pyi new file mode 100644 index 0000000..eedd1b2 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.4/polars/series/series.pyi @@ -0,0 +1,5172 @@ +#: version 0.20.4 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Enum as Enum, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Null as Null, Object as Object, String as String, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, hvplot as hvplot +from polars.exceptions import ModuleUpgradeRequired as ModuleUpgradeRequired, ShapeError as ShapeError +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor, warn_null_comparison as warn_null_comparison +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, Iterable, 
Literal, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +_HVPLOT_AVAILABLE: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_buffer_info(self) -> BufferInfo: + """ + Return pointer, offset, and length information about the underlying buffer. + + Returns + ------- + tuple of ints + Tuple of the form (pointer, offset, length) + + Raises + ------ + ComputeError + If the `Series` contains multiple chunks. + """ + def _get_buffer(self, index: Literal[0, 1, 2]) -> Self | None: + """ + Return the underlying data, validity, or offsets buffer as a Series. + + The data buffer always exists. + The validity buffer may not exist if the column contains no null values. + The offsets buffer only exists for Series of data type `String` and `List`. + + Parameters + ---------- + index + An index indicating the buffer to return: + + - `0` -> data buffer + - `1` -> validity buffer + - `2` -> offsets buffer + + Returns + ------- + Series or None + `Series` if the specified buffer exists, `None` otherwise. + + Raises + ------ + ComputeError + If the `Series` contains multiple chunks. + """ + def _from_buffer(self, dtype: PolarsDataType, buffer_info: BufferInfo, owner: Any) -> Self: + """ + Construct a Series from information about its underlying buffer. + + Parameters + ---------- + dtype + The data type of the buffer. + buffer_info + Tuple containing buffer information in the form `(pointer, offset, length)`. + owner + The object owning the buffer. + + Returns + ------- + Series + """ + def _from_buffers(self, dtype: PolarsDataType, data: Series | Sequence[Series], validity: Series | None = ...) -> Self: + """ + Construct a Series from information about its underlying buffers. + + Parameters + ---------- + dtype + The data type of the resulting Series. + data + Buffers describing the data. For most data types, this is a single Series of + the physical data type of `dtype`. Some data types require multiple buffers: + + - `String`: A data buffer of type `UInt8` and an offsets buffer + of type `Int64`. + validity + Validity buffer. If specified, must be a Series of data type `Boolean`. + + Returns + ------- + Series + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... 
+ def le(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series <= other`.""" + def lt(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series < other`.""" + def eq(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series == other`.""" + def eq_missing(self, other: Any) -> Series | Expr: + ''' + Method equivalent of equality operator `series == other` where `None == None`. + + This differs from the standard `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + ''' + def ne(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series != other`.""" + def ne_missing(self, other: Any) -> Series | Expr: + ''' + Method equivalent of equality operator `series != other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + ''' + def ge(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series >= other`.""" + def gt(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series > other`.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ...
+ def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the Series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to `s[0]`, with a check + that the shape is (1,). With an index, this is equivalent to `s[index]`. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cum_sum().item(-1) + 24 + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.sqrt() + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.414214 + 1.732051 + ] + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.cbrt() + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.259921 + 1.44225 + ] + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. 
_Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `False` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + """ + def log(self, base: float = ...) -> Series: + """ + Compute the logarithm to a given base. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.log() + shape: (3,) + Series: '' [f64] + [ + 0.0 + 0.693147 + 1.098612 + ] + """ + def log1p(self) -> Series: + """ + Compute the natural logarithm of the input array plus one, element-wise. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.log1p() + shape: (3,) + Series: '' [f64] + [ + 0.693147 + 1.098612 + 1.386294 + ] + """ + def log10(self) -> Series: + """ + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> s = pl.Series([10, 100, 1000]) + >>> s.log10() + shape: (3,) + Series: '' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + """ + def exp(self) -> Series: + """ + Compute the exponential, element-wise. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.exp() + shape: (3,) + Series: '' [f64] + [ + 2.718282 + 7.389056 + 20.085537 + ] + """ + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame.
+ + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a Series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + Series has a numeric dtype). All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> s = pl.Series([1, 2, 3, 4, 5]) + >>> s.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + Non-numeric data types may not have all statistics available. + + >>> s = pl.Series(["a", "a", None, "b", "c"]) + >>> s.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 4 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + ''' + def mean(self) -> PythonLiteral | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + ''' + def product(self) -> int | float: + ''' + Reduce this Series to the product value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.product() + 6 + ''' + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4]) + >>> s.nan_max() + 4 + + >>> s = pl.Series("a", [1, float("nan"), 4]) + >>> s.nan_max() + nan + ''' + def nan_min(self) -> int | float | date | datetime | timedelta | str: + ''' + Get minimum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy\'s `nanmin` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4]) + >>> s.nan_min() + 1 + + >>> s = pl.Series("a", [1, float("nan"), 4]) + >>> s.nan_min() + nan + ''' + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + ''' + def median(self) -> PythonLiteral | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series.
+ + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. 
+ + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + ''' + def rle(self) -> Series: + ''' + Get the lengths and values of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + See Also + -------- + rle_id + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Get a distinct integer ID for each run of identical values. + + The ID increases by one each time the value of a column (which can be a + :class:`Struct`) changes. + + This is especially useful when you want to define a new group for every time a + column\'s value changes, rather than for every distinct value of that column. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + include_breakpoint + Include a column that indicates the upper breakpoint. + include_category + Include a column that shows the intervals as categories. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬───────┐ + │ break_point ┆ category ┆ count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═══════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴───────┘ + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. 
+ + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬───────┐ + │ color ┆ count │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴───────┘ + + Sort the output by count. + + >>> s.value_counts(sort=True) + shape: (3, 2) + ┌───────┬───────┐ + │ color ┆ count │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴───────┘ + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula `-sum(pk * log(pk))` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases by `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains.
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + ''' + def cum_max(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cum_max() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + ''' + def cum_min(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cum_min() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + ''' + def cum_prod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_prod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + ''' + def cum_sum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_sum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + ''' + def cum_count(self) -> Self: + ''' + Return the cumulative count of the non-null values in the column. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> s = pl.Series(["x", "k", None, "d"]) + >>> s.cum_count() + shape: (4,) + Series: \'\' [u32] + [ + 1 + 2 + 2 + 3 + ] + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + The resulting series will consist of multiple chunks. + + Parameters + ---------- + other + Series to append. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. + + >>> a.n_chunks() + 2 + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from `append`, which adds the chunks from `other` to the chunks of + this series, `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation (which is expensive). 
+ + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer `append` over `extend` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single `Series`. In the latter case, finish the sequence + of `append` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + head + """ + def gather_every(self, n: int, offset: int = ...) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Start the row index at this offset. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + >>> s.gather_every(2, offset=1) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + ''' + def null_count(self) -> int: + """ + Count the null values in this Series. 
+ + Examples + -------- + >>> s = pl.Series([1, None, None]) + >>> s.null_count() + 2 + """ + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no `null` values. + + Notes + ----- + While the *absence* of a validity bitmask guarantees that a Series does not + have `null` values, the converse is not true, eg: the *presence* of a + bitmask does not mean that there are null values, as every value of the + bitmask could be `false`. + + To confirm that a column has `null` values use :func:`null_count`. + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + Examples + -------- + >>> s = pl.Series([1, 3, 2]) + >>> s.is_sorted() + False + + >>> s = pl.Series([3, 2, 1]) + >>> s.is_sorted(descending=True) + True + """ + def not_(self) -> Series: + ''' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + ''' + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + """ + def equals(self, other: Series) -> bool: + ''' + Check whether the Series is equal to another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. 
+ + See Also + -------- + assert_series_equal + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s1.equals(s1) + True + >>> s1.equals(s2) + False + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that are between the given lower/upper bounds. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. 
+ + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point `nan` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set `zero_copy_only=True`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + ''' + def _view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + + Returns + ------- + SeriesView + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s._view(ignore_nulls=True) + SeriesView([1, 0]) + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. 
+ + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + ''' + def count(self) -> int: + ''' + Return the number of non-null elements in the column. + + See Also + -------- + len + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.count() + 2 + ''' + def len(self) -> int: + ''' + Return the number of elements in the Series. + + Null values count towards the total. + + See Also + -------- + count + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + ''' + def set(self, filter: Series, value: int | float | str | bool | None) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + ''' + def scatter(self, indices: Series | Iterable[int] | int | np.ndarray[Any, Any], values: Series | Iterable[PythonLiteral] | PythonLiteral | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimization (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.scatter(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_index().select( + ... pl.when(pl.col("index") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + ''' + def round_sig_figs(self, digits: int) -> Series: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s.round_sig_figs(2) + shape: (3,) + Series: '' [f64] + [ + 0.012 + 3.3 + 1200.0 + ] + """ + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). 
+ + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + ''' + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. + + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify `fill_value` to fill the resulting null values. + + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. 
+ + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their std dev. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their variance. 
+ + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. 
+ + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. 
Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + + Examples + -------- + >>> s = pl.Series([1, -2, -3]) + >>> s.abs() + shape: (3,) + Series: '' [i64] + [ + 1 + 2 + 3 + ] + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. 
+ + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> s = pl.Series([1, 2, 2, 4, 5]) + >>> s.skew() + 0.34776706224699483 + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. 
+ If set to `None` (default), no lower bound is applied. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no upper bound is applied. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> s = pl.Series([-50, 5, 50, None]) + >>> s.clip(1, 10) + shape: (4,) + Series: '' [i64] + [ + 1 + 5 + 10 + null + ] + + Specifying only a single bound: + + >>> s.clip(upper_bound=10) + shape: (4,) + Series: '' [i64] + [ + -50 + 5 + 10 + null + ] + """ + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [i32] + [ + -2147483648 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32) + >>> s.lower_bound() + shape: (1,) + Series: \'s\' [f32] + [ + -inf + ] + ''' + def upper_bound(self) -> Self: + ''' + Return the upper bound of this Series\' dtype as a unit Series. + + See Also + -------- + lower_bound : return the lower bound of the given Series\' dtype. + + Examples + -------- + >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [i8] + [ + 127 + ] + + >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64) + >>> s.upper_bound() + shape: (1,) + Series: \'s\' [f64] + [ + inf + ] + ''' + def replace(self, old: IntoExpr | Sequence[Any] | Mapping[Any, Any], new: IntoExpr | Sequence[Any] | NoDefault = ...) -> Self: + ''' + Replace values by different values. + + Parameters + ---------- + old + Value or sequence of values to replace. + Also accepts a mapping of values to their replacement as syntactic sugar for + `replace(new=Series(mapping.keys()), old=Series(mapping.values()))`. + new + Value or sequence of values to replace by. + Length must match the length of `old` or have length 1. + default + Set values that were not replaced to this value. + Defaults to keeping the original value. + Accepts expression input. Non-expression inputs are parsed as literals. + return_dtype + The data type of the resulting Series. If set to `None` (default), + the data type is determined automatically based on the other inputs. + + See Also + -------- + str.replace + + Notes + ----- + The global string cache must be enabled when replacing categorical values. + + Examples + -------- + Replace a single value by another value. Values that were not replaced remain + unchanged. + + >>> s = pl.Series([1, 2, 2, 3]) + >>> s.replace(2, 100) + shape: (4,) + Series: \'\' [i64] + [ + 1 + 100 + 100 + 3 + ] + + Replace multiple values by passing sequences to the `old` and `new` parameters. + + >>> s.replace([2, 3], [100, 200]) + shape: (4,) + Series: \'\' [i64] + [ + 1 + 100 + 100 + 200 + ] + + Passing a mapping with replacements is also supported as syntactic sugar. + Specify a default to set all values that were not matched. + + >>> mapping = {2: 100, 3: 200} + >>> s.replace(mapping, default=-1) + shape: (4,) + Series: \'\' [i64] + [ + -1 + 100 + 100 + 200 + ] + + + The default can be another Series. 
+ + >>> default = pl.Series([2.5, 5.0, 7.5, 10.0]) + >>> s.replace(2, 100, default=default) + shape: (4,) + Series: \'\' [f64] + [ + 2.5 + 100.0 + 100.0 + 10.0 + ] + + Replacing by values of a different data type sets the return type based on + a combination of the `new` data type and either the original data type or the + default data type if it was set. + + >>> s = pl.Series(["x", "y", "z"]) + >>> mapping = {"x": 1, "y": 2, "z": 3} + >>> s.replace(mapping) + shape: (3,) + Series: \'\' [str] + [ + "1" + "2" + "3" + ] + >>> s.replace(mapping, default=None) + shape: (3,) + Series: \'\' [i64] + [ + 1 + 2 + 3 + ] + + Set the `return_dtype` parameter to control the resulting data type directly. + + >>> s.replace(mapping, return_dtype=pl.UInt8) + shape: (3,) + Series: \'\' [u8] + [ + 1 + 2 + 3 + ] + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.implode() + shape: (1,) + Series: \'a\' [list[i64]] + [ + [1, 2, 3] + ] + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + """ + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() # doctest: +SKIP + True + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_integer()` instead. + For signed/unsigned variants, use `Series.dtype.is_signed_integer()` + or `Series.dtype.is_unsigned_integer()`. 
+ + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() # doctest: +SKIP + True + >>> s.is_integer(signed=False) # doctest: +SKIP + True + >>> s.is_integer(signed=True) # doctest: +SKIP + False + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_numeric()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() # doctest: +SKIP + True + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_temporal()` instead. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() # doctest: +SKIP + True + >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP + False + """ + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Boolean` instead. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() # doctest: +SKIP + True + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a String. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.String` instead. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() # doctest: +SKIP + True + ''' + def take_every(self, n: int, offset: int = ...) -> Series: + """ + Take every nth value in the Series and return as new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + Index location used for selection. + """ + def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + """ + Set values at the index locations. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`scatter`. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + """ + def cumsum(self) -> Series: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummax(self) -> Series: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummin(self) -> Series: + """ + Get an array with the cumulative min computed at every element. + + .. 
deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cumprod(self) -> Series: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def view(self) -> SeriesView: + """ + Get a view into this Series data with a numpy array. + + .. deprecated:: 0.19.14 + This method will be removed in a future version. + + This operation doesn't clone data, but does not include missing values. + Don't use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in the Series using a remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + def series_equal(self, other: Series) -> bool: + """ + Check whether the Series is equal to another Series. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`equals`. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... + @property + def plot(self): ... 
+def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.5/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.5/polars/dataframe/frame.pyi new file mode 100644 index 0000000..6d74956 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.5/polars/dataframe/frame.pyi @@ -0,0 +1,7073 @@ +#: version 0.20.5 +import P +import deltalake +import deltalake.table +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Enum as Enum, Float64 as Float64, Null as Null, Object as Object, String as String, Unknown as Unknown +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, hvplot as hvplot +from polars.exceptions import ModuleUpgradeRequired as ModuleUpgradeRequired, NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, frame_to_pydf as frame_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_parameter_as_positional as deprecate_parameter_as_positional, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as 
deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import _prepare_row_index_args as _prepare_row_index_args, _process_null_values as _process_null_values, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes, warn_null_comparison as warn_null_comparison +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, IO, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_HVPLOT_AVAILABLE: bool +_PANDAS_AVAILABLE: bool +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
+ schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. 
+ schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + """ + @classmethod + def _read_csv(cls, source: str | Path | IO[bytes] | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use `pl.read_csv` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + """ + @classmethod + def _read_parquet(cls, source: str | Path | IO[bytes] | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use `pl.read_parquet` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading `n_rows`. + """ + @classmethod + def _read_ipc(cls, source: str | Path | IO[bytes] | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading `n_rows`. + row_index_name + Row index name. + row_index_offset + Row index offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | IO[bytes] | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading `n_rows`. + row_index_name + Row index name. 
+ row_index_offset + Row index offset. + rechunk + Make sure that all data is contiguous. + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use `pl.read_json` to dispatch to this method. + + See Also + -------- + polars.io.read_json + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use `pl.read_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with `NaN`. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to `True` will raise a `NotImplementedError`. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars DataFrame to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (, 64, \'g\', \'=\') + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... 
+ def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the DataFrame as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to `df[0,0]`, with a check that + the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + ''' + def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are Series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + `structured` is set to `False` and the DataFrame dtypes allow for a + global dtype for all columns. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + function for the conversion to numpy if necessary. + + Notes + ----- + If you\'re attempting to convert String or Decimal to an array, you\'ll need to + install `pyarrow`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. 
, \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Cast to a pandas DataFrame. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow backed-extension arrays instead of numpy arrays for each column + of the pandas DataFrame; this allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy arrays if that operation is not supported by + pyarrow compute functions. + **kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Examples + -------- + >>> import pandas + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> pandas_df1 = df1.to_pandas() + >>> type(pandas_df1) + + >>> pandas_df1.dtypes + foo int64 + bar int64 + ham object + dtype: object + >>> df2 = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6, None, 8], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> pandas_df2 = df2.to_pandas() + >>> pandas_df2 + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + >>> pandas_df2.dtypes + foo float64 + bar float64 + ham object + dtype: object + >>> pandas_df2_pa = df2.to_pandas( + ... use_pyarrow_extension_array=True + ... ) # doctest: +SKIP + >>> pandas_df2_pa # doctest: +SKIP + foo bar ham + 0 1 6 + 1 2 b + 2 8 c + >>> pandas_df2_pa.dtypes # doctest: +SKIP + foo int64[pyarrow] + bar int64[pyarrow] + ham large_string[pyarrow] + dtype: object + ''' + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.String), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.String), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. 
This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path or writeable file-like object to which the data will be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + name + Schema name. Defaults to empty string. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open `xlsxwriter.Workbook` object that has not been closed. + If None, writes to a `dataframe.xlsx` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of `{"key":value,}` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. + column_formats : dict + A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. + dtype_formats : dict + A `{dtype:str,}` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + `column_formats` param). It is also valid to use dtype groups such as + `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid `xlsxwriter` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all `xlsxwriter` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. 
+ header_format : dict + A `{key:value,}` dictionary of `xlsxwriter` format options to apply + to the table header row, such as `{"bold":True, "font_color":"#702963"}`. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a `{colname:funcname,}` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A `{colname:int,}` or `{selector:int,}` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a `{colname:columns,}` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or `{row_index:int,}` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that `row_index` starts at zero and will be + the header row (unless `include_header` is False). + sparklines : dict + A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an `xlsxwriter`-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. 
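+             For illustration (hypothetical column name "total"): `{"total": "=[@colx]*[@coly]"}` appends a simple derived column at the end of the table, while the dict form shown in the last example below additionally controls placement and return dtype.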
+ float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + include_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible `xlsxwriter` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic DataFrame: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... 
) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... 
# create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC data will be + written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC record batch data will + be written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. 
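+             For example (illustrative; `path` as defined in the examples below): `df.write_parquet(path, compression="zstd")`.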
+ compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + data_page_size + Size of the data page in bytes. Defaults to 1024^2 bytes. + use_pyarrow + Use C++ parquet implementation vs Rust parquet implementation. + At the moment C++ supports more features. + pyarrow_options + Arguments passed to `pyarrow.parquet.write_table`. + + If you pass `partition_cols` here, the dataset will be written + using `pyarrow.parquet.write_to_dataset`. + The `partition_cols` parameter leads to write the dataset to a directory. + Similar to Spark\'s partitioned datasets. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.parquet" + >>> df.write_parquet(path) + + We can use pyarrow with use_pyarrow_write_to_dataset=True + to write partitioned datasets. The following example will + write the first row to ../watermark=1/*.parquet and the + other rows to ../watermark=2/*.parquet. + + >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]}) + >>> path: pathlib.Path = dirpath / "partitioned_object" + >>> df.write_parquet( + ... path, + ... use_pyarrow=True, + ... pyarrow_options={"partition_cols": ["watermark"]}, + ... ) + ''' + def write_database(self, table_name: str, connection: str) -> int: + ''' + Write a polars frame to a database. + + Parameters + ---------- + table_name + Schema-qualified name of the table to create or append to in the target + SQL database. If your table name contains special characters, it should + be quoted. + connection + Connection URI string, for example: + + * "postgresql://user:pass@server:port/database" + * "sqlite:////path/to/database.db" + if_table_exists : {\'append\', \'replace\', \'fail\'} + The insert mode: + + * \'replace\' will create a new database table, overwriting an existing one. + * \'append\' will append to an existing table. + * \'fail\' will fail if table already exists. + engine : {\'sqlalchemy\', \'adbc\'} + Select the engine to use for writing frame data. + + Returns + ------- + int + The number of rows affected, if the driver provides this information. + Otherwise, returns -1. + ''' + def write_delta(self, target: str | Path | deltalake.DeltaTable) -> deltalake.table.TableMerger | None: + ''' + Write DataFrame as delta table. + + Parameters + ---------- + target + URI of a table or a DeltaTable object. + mode : {\'error\', \'append\', \'overwrite\', \'ignore\', \'merge\'} + How to handle existing data. + + - If \'error\', throw an error if the table already exists (default). + - If \'append\', will add new data. + - If \'overwrite\', will replace table with new data. + - If \'ignore\', will not write anything if table already exists. + - If \'merge\', return a `TableMerger` object to merge data from the DataFrame + with the existing data. + overwrite_schema + If True, allows updating the schema of the table. + storage_options + Extra options for the storage backends supported by `deltalake`. + For cloud storages, this may include configurations for authentication etc. 
+ + - See a list of supported storage options for S3 `here `__. + - See a list of supported storage options for GCS `here `__. + - See a list of supported storage options for Azure `here `__. + delta_write_options + Additional keyword arguments while writing a Delta lake Table. + See a list of supported write options `here `__. + delta_merge_options + Keyword arguments which are required to `MERGE` a Delta lake Table. + See a list of supported merge options `here `__. + + Raises + ------ + TypeError + If the DataFrame contains unsupported data types. + ArrowInvalidError + If the DataFrame contains data types that could not be cast to their + primitive type. + TableNotFoundError + If the delta table doesn\'t exist and MERGE action is triggered + + Notes + ----- + The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time` + are not supported by the delta protocol specification and will raise a + TypeError. + + Polars columns are always nullable. To write data to a delta table with + non-nullable columns, a custom pyarrow schema has to be passed to the + `delta_write_options`. See the last example below. + + Examples + -------- + Write a dataframe to the local filesystem as a Delta Lake table. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> df.write_delta(table_path) # doctest: +SKIP + + Append data to an existing Delta Lake table on the local filesystem. + Note that this will fail if the schema of the new data does not match the + schema of the existing table. + + >>> df.write_delta(table_path, mode="append") # doctest: +SKIP + + Overwrite a Delta Lake table as a new version. + If the schemas of the new and old data are the same, setting + `overwrite_schema` is not required. + + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, mode="overwrite", overwrite_schema=True + ... ) # doctest: +SKIP + + Write a DataFrame as a Delta Lake table to a cloud object store like S3. + + >>> table_path = "s3://bucket/prefix/to/delta-table/" + >>> df.write_delta( + ... table_path, + ... storage_options={ + ... "AWS_REGION": "THE_AWS_REGION", + ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID", + ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY", + ... }, + ... ) # doctest: +SKIP + + Write DataFrame as a Delta Lake table with non-nullable columns. + + >>> import pyarrow as pa + >>> existing_table_path = "/path/to/delta-table/" + >>> df.write_delta( + ... existing_table_path, + ... delta_write_options={ + ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)]) + ... }, + ... ) # doctest: +SKIP + + Merge the DataFrame with an existing Delta Lake table. + For all `TableMerger` methods, check the deltalake docs + `here `__. + + Schema evolution is not yet supported in by the `deltalake` package, therefore + `overwrite_schema` will not have any effect on a merge operation. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> table_path = "/path/to/delta-table/" + >>> ( + ... df.write_delta( + ... "table_path", + ... mode="merge", + ... delta_merge_options={ + ... "predicate": "s.foo = t.foo", + ... "source_alias": "s", + ... "target_alias": "t", + ... }, + ... ) + ... .when_matched_update_all() + ... .when_not_matched_insert_all() + ... .execute() + ... 
) # doctest: +SKIP + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": list(reversed(range(1_000_000))), + ... "y": [v / 1000 for v in range(1_000_000)], + ... "z": [str(v) for v in range(1_000_000)], + ... }, + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.String)], + ... ) + >>> df.estimated_size() + 25888898 + >>> df.estimated_size("mb") + 24.689577102661133 + ''' + def transpose(self) -> Self: + ''' + Transpose a DataFrame over the diagonal. + + Parameters + ---------- + include_header + If set, the column names will be added as first column. + header_name + If `include_header` is set, this determines the name of the column that will + be inserted. + column_names + Optional iterable yielding strings or a string naming an existing column. + These will name the value (non-header) columns in the transposed data. + + Notes + ----- + This is a very expensive operation. Perhaps you can do it differently. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.transpose(include_header=True) + shape: (2, 4) + ┌────────┬──────────┬──────────┬──────────┐ + │ column ┆ column_0 ┆ column_1 ┆ column_2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪══════════╪══════════╪══════════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 4 ┆ 5 ┆ 6 │ + └────────┴──────────┴──────────┴──────────┘ + + Replace the auto-generated column names with a list + + >>> df.transpose(include_header=False, column_names=["x", "y", "z"]) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + + Include the header as a separate column + + >>> df.transpose( + ... include_header=True, header_name="foo", column_names=["x", "y", "z"] + ... ) + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ foo ┆ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┴─────┘ + + Replace the auto-generated column with column names from a generator function + + >>> def name_generator(): + ... base_name = "my_column_" + ... count = 0 + ... while True: + ... yield f"{base_name}{count}" + ... 
count += 1 + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 4 ┆ 5 ┆ 6 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["i", "j", "k"], a=[1, 2, 3], b=[4, 5, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ i ┆ j ┆ k │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ i ┆ j ┆ k │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 4 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + ''' + def rename(self, mapping: dict[str, str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def insert_column(self, index: int, column: Series) -> Self: + ''' + Insert a Series at a certain column index. + + This operation is in place. + + Parameters + ---------- + index + Index at which to insert the new `Series` column. + column + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_column(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_column(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on one or more predicate expressions. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression(s) that evaluates to a boolean Series. 
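+             For example, `pl.col("foo") > 1` or `(pl.col("foo") < 3) & (pl.col("ham") == "a")`, as shown in the examples below.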
+ constraints + Column filters; use `name = value` to filter columns by the supplied value. + Each constraint will behave the same as `pl.col(name).eq(value)`, and + will be implicitly joined with the other filter conditions using `&`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... ) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... ) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> Self: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Warnings + -------- + We will never guarantee the output of describe to be stable. + It will show statistics that we deem informative and may + be updated in the future. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... 
"float": [1.0, 2.8, 3.0], + ... "int": [4, 5, None], + ... "bool": [True, False, True], + ... "str": [None, "b", "c"], + ... "str2": ["usd", "eur", None], + ... "date": [date(2020, 1, 1), date(2021, 1, 1), date(2022, 1, 1)], + ... } + ... ) + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬───────┬──────┬──────┬────────────┐ + │ describe ┆ float ┆ int ┆ bool ┆ str ┆ str2 ┆ date │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ str ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪═══════╪══════╪══════╪════════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3 ┆ 2 ┆ 2 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null ┆ null │ + │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 4.0 ┆ False ┆ b ┆ eur ┆ 2020-01-01 │ + │ 25% ┆ 2.8 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │ + │ 50% ┆ 2.8 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │ + │ max ┆ 3.0 ┆ 5.0 ┆ True ┆ c ┆ usd ┆ 2022-01-01 │ + └────────────┴──────────┴──────────┴───────┴──────┴──────┴────────────┘ + ''' + def get_column_index(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.get_column_index("ham") + 2 + ''' + def replace_column(self, index: int, column: Series) -> Self: + ''' + Replace a column at an index location. + + This operation is in place. + + Parameters + ---------- + index + Column index. + column + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_column(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If `descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the `k` smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If `descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the `k` largest. Bottom-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + ''' + def equals(self, other: DataFrame) -> bool: + ''' + Check whether the DataFrame is equal to another DataFrame. + + Parameters + ---------- + other + DataFrame to compare with. 
+ null_equal + Consider null values as equal. + + See Also + -------- + assert_frame_equal + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.equals(df1) + True + >>> df1.equals(df2) + False + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. 
+ + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + head + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... 
return data.with_columns(pl.col(col_name).cast(pl.Int64)) + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + ''' + def with_row_index(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a row index as the first column in the DataFrame. + + Parameters + ---------- + name + Name of the index column. + offset + Start the index at this offset. Cannot be negative. + + Notes + ----- + The resulting column does not have any special properties. It is a regular + column of type `UInt32` (or `UInt64` in `polars-u64-idx`). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_index() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + >>> df.with_row_index("id", offset=1000) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ id ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1000 ┆ 1 ┆ 2 │ + │ 1001 ┆ 3 ┆ 4 │ + │ 1002 ┆ 5 ┆ 6 │ + └──────┴─────┴─────┘ + + An index column can also be created using the expressions :func:`int_range` + and :func:`len`. + + >>> df.select( + ... pl.int_range(pl.len(), dtype=pl.UInt32).alias("index"), + ... pl.all(), + ... ) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + .. deprecated:: + Use :meth:`with_row_index` instead. + Note that the default column name has changed from \'row_nr\' to \'index\'. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() # doctest: +SKIP + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. 
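+ + For example, `df.group_by("a", maintain_order=True).agg(pl.col("c"))` returns the groups in the order in which they first appear in the data (see the examples below).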
+ + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The `GroupBy` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... 
+ * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. 
+ * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + Time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + Interval will start \'every\' duration. + offset + Change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group. + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... 
"values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\', \'outer_coalesce\'} + Join strategy. + + * *inner* + Returns rows that have matching values in both tables + * *left* + Returns all rows from the left table, and the matched rows from the + right table + * *outer* + Returns all rows when there is a match in either left or right table + * *outer_coalesce* + Same as \'outer\', but coalesces the key columns + * *cross* + Returns the cartesian product of rows from both tables + * *semi* + Filter rows that have a match in the right table. + * *anti* + Filter rows that do not have a match in the right table. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported by the streaming engine. + - This is only supported when joined by single columns. + join_nulls + Join on null values. By default null values will never produce matches. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 5) + ┌──────┬──────┬──────┬───────┬───────────┐ + │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞══════╪══════╪══════╪═══════╪═══════════╡ + │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │ + │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │ + │ null ┆ null ┆ null ┆ z ┆ d │ + │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │ + └──────┴──────┴──────┴───────┴───────────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see `pl.StringCache()`. + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: `udf(row)`. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level `apply` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level `apply` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of + this `DataFrame`, `extend` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer `vstack` over `extend` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single `DataFrame`. In the latter case, finish the sequence of + `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + ''' + def drop(self, *columns: ColumnNameOrSelector | Iterable[ColumnNameOrSelector]) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + *columns + Names of the columns that should be removed from the dataframe. + Accepts column selector input. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.String).to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.String}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. 
+ + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill `value`. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or String datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'min\', \'max\', \'first\', \'last\', \'sum\', \'mean\', \'median\', \'len\'} + - An expression to do the aggregation. + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... "col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... 
) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Warnings + -------- + This functionality is experimental and may be subject to changes + without it being considered a breaking change. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. + If set to `None` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. 
+ + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are tuples of + the distinct group values that identify each group. If a single string + was passed to `by`, the keys are a single value instead of a tuple. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying `as_dict=True`. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {(\'a\',): shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + (\'b\',): shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + (\'c\',): shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. 
+ + >>> df.shift(-2, fill_value=100) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this DataFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + See Also + -------- + with_columns + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + ''' + def max(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`max_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + ''' + def max_horizontal(self) -> Series: + ''' + Get the maximum value horizontally across columns. + + Returns + ------- + Series + A Series named `"max"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.max_horizontal() + shape: (3,) + Series: \'max\' [f64] + [ + 4.0 + 5.0 + 6.0 + ] + ''' + def min(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`min_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + ''' + def min_horizontal(self) -> Series: + ''' + Get the minimum value horizontally across columns. + + Returns + ------- + Series + A Series named `"min"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.min_horizontal() + shape: (3,) + Series: \'min\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`sum_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + ''' + def sum_horizontal(self) -> Series: + ''' + Sum all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. 
+ + Returns + ------- + Series + A Series named `"sum"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.sum_horizontal() + shape: (3,) + Series: \'sum\' [f64] + [ + 5.0 + 7.0 + 9.0 + ] + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`mean_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + ''' + def mean_horizontal(self) -> Series: + ''' + Take the mean of all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"mean"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.mean_horizontal() + shape: (3,) + Series: \'mean\' [f64] + [ + 2.5 + 3.5 + 4.5 + ] + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to `None` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the `DataFrame` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. 
+ + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + - Int8 + String = String + - Float32 + Int64 = Float32 + - Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The `index` and `by_predicate` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using `by_predicate` it is an error condition if anything other than + one row is returned; more than one row raises `TooManyRowsReturnedError`, and + zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of `iter_rows()` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify `named=True` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use `by_predicate` to return the row that matches the given predicate. 
+ + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using `iter_rows` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materializing all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reoconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialize all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialize all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + ''' + def iter_columns(self) -> Iterator[Series]: + ''' + Returns an iterator over the DataFrame\'s columns. 
+ + Notes + ----- + Consider whether you can use :func:`all` instead. + If you can, it will be more efficient. + + Returns + ------- + Iterator of Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [s.name for s in df.iter_columns()] + [\'a\', \'b\'] + + If you\'re using this to modify a dataframe\'s columns, e.g. + + >>> # Do NOT do this + >>> pl.DataFrame(column * 2 for column in df.iter_columns()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + + then consider whether you can use :func:`all` instead: + + >>> df.select(pl.all() * 2) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + """ + def gather_every(self, n: int, offset: int = ...) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.gather_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + >>> s.gather_every(2, offset=1) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash_rows` does not guarantee stable results + across different Polars versions. 
Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + ''' + def to_struct(self, name: str = ...) -> Series: + ''' + Convert a `DataFrame` to a `Series` of type `Struct`. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy `corrcoef` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. 
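+
+        The keyword arguments described below are forwarded to `numpy.corrcoef`.
+        A minimal sketch, assuming a numpy version whose `corrcoef` accepts the
+        `dtype` keyword (the frame is an illustrative placeholder; output omitted):
+
+        >>> import numpy as np
+        >>> pl.DataFrame({"x": [1.0, 2.0, 3.0], "y": [3.0, 2.0, 1.0]}).corr(
+        ...     dtype=np.float32
+        ... )  # doctest: +IGNORE_RESULT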
+ + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy `corrcoef`. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the values in `other`. + + .. warning:: + This functionality is experimental and may change without it being + considered a breaking change. + + By default, null values in the right frame are ignored. Use + `include_nulls=False` to overwrite values in this frame with + null values in the other frame. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. If set to `None` (default), + the implicit row index of each frame is used as a join key. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. 
+ + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce + when `include_nulls = False` + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + ''' + def count(self) -> DataFrame: + ''' + Return the number of non-null elements for each column. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"a": [1, 2, 3, 4], "b": [1, 2, 1, None], "c": [None, None, None, None]} + ... ) + >>> df.count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 3 ┆ 0 │ + └─────┴─────┴─────┘ + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. 
+ This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + """ + def take_every(self, n: int, offset: int = ...) -> DataFrame: + """ + Take every nth row in the DataFrame and return as a new DataFrame. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + def find_idx_by_name(self, name: str) -> int: + """ + Find the index of a column by name. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`get_column_index`. + + Parameters + ---------- + name + Name of the column to find. + """ + def insert_at_idx(self, index: int, column: Series) -> Self: + """ + Insert a Series at a certain column index. This operation is in place. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`insert_column`. + + Parameters + ---------- + index + Column to insert the new `Series` column. + column + `Series` to insert. + """ + def replace_at_idx(self, index: int, new_column: Series) -> Self: + """ + Replace a column at an index location. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`replace_column`. + + Parameters + ---------- + index + Column index. + new_column + Series that will replace the column. 
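+
+        A minimal migration sketch using the renamed API noted above (the frame
+        and the replacement series are illustrative placeholders; output omitted):
+
+        >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})
+        >>> df.replace_column(0, pl.Series("apple", [10, 20, 30]))  # doctest: +IGNORE_RESULT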
+ """ + def frame_equal(self, other: DataFrame) -> bool: + """ + Check whether the DataFrame is equal to another DataFrame. + + .. deprecated:: 0.19.16 + This method has been renamed to :func:`equals`. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + """ + @property + def plot(self): ... + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.5/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.5/polars/expr/expr.pyi new file mode 100644 index 0000000..abfe423 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.5/polars/expr/expr.pyi @@ -0,0 +1,8394 @@ +#: version 0.20.5 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Int64 as Int64 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions, parse_predicates_constraints_as_expression as parse_predicates_constraints_as_expression +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.various import no_default as no_default, sphinx_accessor as sphinx_accessor, warn_null_comparison as warn_null_comparison +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + class _map_batches_wrapper: + def __init__(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None) -> None: ... + def __call__(self, *args: Any, **kwargs: Any) -> Any: ... + _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... 
+ @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: IntoExpr) -> Self: ... + def __radd__(self, other: IntoExpr) -> Self: ... + def __and__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __rand__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __eq__(self, other: IntoExpr) -> Self: ... + def __floordiv__(self, other: IntoExpr) -> Self: ... + def __rfloordiv__(self, other: IntoExpr) -> Self: ... + def __ge__(self, other: IntoExpr) -> Self: ... + def __gt__(self, other: IntoExpr) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: IntoExpr) -> Self: ... + def __lt__(self, other: IntoExpr) -> Self: ... + def __mod__(self, other: IntoExpr) -> Self: ... + def __rmod__(self, other: IntoExpr) -> Self: ... + def __mul__(self, other: IntoExpr) -> Self: ... + def __rmul__(self, other: IntoExpr) -> Self: ... + def __ne__(self, other: IntoExpr) -> Self: ... + def __neg__(self) -> Expr: ... + def __or__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __ror__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, exponent: IntoExprColumn | int | float) -> Self: ... + def __rpow__(self, base: IntoExprColumn | int | float) -> Expr: ... + def __sub__(self, other: IntoExpr) -> Self: ... + def __rsub__(self, other: IntoExpr) -> Self: ... + def __truediv__(self, other: IntoExpr) -> Self: ... + def __rtruediv__(self, other: IntoExpr) -> Self: ... + def __xor__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __rxor__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + `_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). 
+ + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... ) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.map`. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + keep_name + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.prefix`. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. 
+ + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.suffix`. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.keep`. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with `^` and end with `$`. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... 
) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + ''' + def count(self) -> Self: + ''' + Return the number of non-null elements in the column. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + len + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 2 │ + └─────┴─────┘ + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values count towards the total. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + count + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + ''' + def cum_sum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_sum().alias("cum_sum"), + ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_sum ┆ cum_sum_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 10 │ + │ 2 ┆ 3 ┆ 9 │ + │ 3 ┆ 6 ┆ 7 │ + │ 4 ┆ 10 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_sum().alias("value_cum_sum"), + ... pl.col("values") + ... .cum_sum() + ... .forward_fill() + ... .alias("value_cum_sum_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬───────────────┬──────────────────────────┐ + │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═══════════════╪══════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴───────────────┴──────────────────────────┘ + ''' + def cum_prod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_prod().alias("cum_prod"), + ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), + ... 
) + shape: (4, 3) + ┌─────┬──────────┬──────────────────┐ + │ a ┆ cum_prod ┆ cum_prod_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════╪══════════════════╡ + │ 1 ┆ 1 ┆ 24 │ + │ 2 ┆ 2 ┆ 24 │ + │ 3 ┆ 6 ┆ 12 │ + │ 4 ┆ 24 ┆ 4 │ + └─────┴──────────┴──────────────────┘ + ''' + def cum_min(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_min().alias("cum_min"), + ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_min ┆ cum_min_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 1 ┆ 3 │ + │ 4 ┆ 1 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + ''' + def cum_max(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_max().alias("cum_max"), + ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_max ┆ cum_max_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 2 ┆ 4 │ + │ 3 ┆ 3 ┆ 4 │ + │ 4 ┆ 4 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_max().alias("cum_max"), + ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬─────────┬────────────────────┐ + │ values ┆ cum_max ┆ cum_max_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════════╪════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴─────────┴────────────────────┘ + ''' + def cum_count(self) -> Self: + ''' + Return the cumulative count of the non-null values in the column. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": ["x", "k", None, "d"]}) + >>> df.with_columns( + ... pl.col("a").cum_count().alias("cum_count"), + ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), + ... ) + shape: (4, 3) + ┌──────┬───────────┬───────────────────┐ + │ a ┆ cum_count ┆ cum_count_reverse │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u32 ┆ u32 │ + ╞══════╪═══════════╪═══════════════════╡ + │ x ┆ 1 ┆ 3 │ + │ k ┆ 2 ┆ 2 │ + │ null ┆ 2 ┆ 1 │ + │ d ┆ 3 ┆ 1 │ + └──────┴───────────┴───────────────────┘ + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + ''' + def round_sig_figs(self, digits: int) -> Self: + ''' + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) + >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) + shape: (3, 2) + ┌─────────┬────────────────┐ + │ a ┆ round_sig_figs │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════════╪════════════════╡ + │ 0.01234 ┆ 0.012 │ + │ 3.333 ┆ 3.3 │ + │ 1234.0 ┆ 1200.0 │ + └─────────┴────────────────┘ + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... 
) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... 
) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + See Also + -------- + Expr.get : Take a single value + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg( + ... pl.col("value").gather([2, 1]) + ... ) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + ''' + def get(self, index: int | Expr) -> Self: + ''' + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. 
+ + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... 
"fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Notes + ----- + `null` is considered to be a unique value for the purposes of this operation. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 1, 2, 2, 3], "y": [1, 1, 1, None, None]}) + >>> df.select( + ... x_unique=pl.col("x").n_unique(), + ... y_unique=pl.col("y").n_unique(), + ... ) + shape: (1, 2) + ┌──────────┬──────────┐ + │ x_unique ┆ y_unique │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞══════════╪══════════╡ + │ 3 ┆ 2 │ + └──────────┴──────────┘ + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"n": [1, 1, 2]}) + >>> df.select(pl.col("n").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ n │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + >>> df = pl.DataFrame({"n": range(1000)}) + >>> df.select( + ... exact=pl.col("n").n_unique(), + ... approx=pl.col("n").approx_n_unique(), + ... ) # doctest: +SKIP + shape: (1, 2) + ┌───────┬────────┐ + │ exact ┆ approx │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═══════╪════════╡ + │ 1000 ┆ 1005 │ + └───────┴────────┘ + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [10, None, 300], + ... "c": [350, 650, 850], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... 
) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. 
+ + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + ''' + def rle(self) -> Self: + ''' + Get the lengths and values of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + See Also + -------- + rle_id + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Get a distinct integer ID for each run of identical values. + + The ID increases by one each time the value of a column (which can be a + :class:`Struct`) changes. + + This is especially useful when you want to define a new group for every time a + column\'s value changes, rather than for every distinct value of that column. + + See Also + -------- + rle + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn], **constraints: Any) -> Self: + ''' + Filter the expression based on one or more predicate expressions. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. + + Parameters + ---------- + predicates + Expression(s) that evaluates to a boolean Series. + constraints + Column filters; use `name = value` to filter columns by the supplied value. 
+ Each constraint will behave the same as `pl.col(name).eq(value)`, and + will be implicitly joined with the other filter conditions using `&`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), + ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + Filter expressions can also take constraints as keyword arguments. + + >>> import polars.selectors as cs + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "a", "a", "a", "b", "b", "b", "b", "b"], + ... "n": [1, 2, 2, 3, 1, 3, 3, 2, 3], + ... }, + ... ) + >>> df.group_by("key").agg( + ... n_1=pl.col("n").filter(n=1).sum(), + ... n_2=pl.col("n").filter(n=2).sum(), + ... n_3=pl.col("n").filter(n=3).sum(), + ... ).sort(by="key") + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ key ┆ n_1 ┆ n_2 ┆ n_3 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 4 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 9 │ + └─────┴─────┴─────┴─────┘ + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + .. deprecated:: 0.20.4 + Use :func:`filter` instead. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( # doctest: +SKIP + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series (or a NumPy array, in which + case it will be automatically converted into a Series). If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for `map` functions is transforming the values + represented by an expression using a third-party library. + + .. warning:: + If you are looking to map a function over a window function or group_by + context, refer to :func:`map_elements` instead. + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + is_elementwise + If set to true this can run in the streaming engine, but may yield + incorrect results in group-by. Ensure you know what you are doing! + agg_list + Aggregate the values of the expression into a list before applying the + function. This parameter only works in a group-by context. + The function will be invoked only once on a list of groups, rather than + once per group. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. 
+ + See Also + -------- + map_elements + replace + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + In a group-by context, the `agg_list` parameter can improve performance if used + correctly. The following example has `agg_list` set to `False`, which causes + the function to be applied once per group. The input of the function is a + Series of type `Int64`. This is less efficient. + + >>> df = pl.DataFrame( + ... { + ... "a": [0, 1, 0, 1], + ... "b": [1, 2, 3, 4], + ... } + ... ) + >>> df.group_by("a").agg( + ... pl.col("b").map_batches(lambda x: x.max(), agg_list=False) + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ list[i64] │ + ╞═════╪═══════════╡ + │ 1 ┆ [4] │ + │ 0 ┆ [3] │ + └─────┴───────────┘ + + Using `agg_list=True` would be more efficient. In this example, the input of + the function is a Series of type `List(Int64)`. + + >>> df.group_by("a").agg( + ... pl.col("b").map_batches(lambda x: x.list.max(), agg_list=True) + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴─────┘ + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type `Callable[[Any], Any]`. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type `Callable[[Series], Any]`. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be `pl.Unknown`. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). + pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + This functionality is considered experimental and may be removed/changed. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + Notes + ----- + * Using `map_elements` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. 
+ + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using `over` is considered a GroupBy context + here, so `map_elements` can be used to map functions over window groups. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using `over` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... ).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort("key") # doctest: +IGNORE_RESULT + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... "values": [[1, 2], [2, 3], [4]], + ... } + ... 
) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + ''' + def gather_every(self, n: int, offset: int = ...) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").gather_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + >>> df.select(pl.col("foo").gather_every(3, offset=1)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 5 │ + │ 8 │ + └─────┘ + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator `expr & other & ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... pl.col("y") == pl.col("y"), + ... 
pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator `expr | other | ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other` where `None == None`. + + This differs from default `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator `expr >= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ true │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator `expr > other`. 
+ + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator `expr <= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator `expr < other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator `expr != other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr != other` where `None == None`. + + This differs from default `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... 
) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator `expr + other`. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator `expr // other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator `expr % other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator `expr * other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator `expr - other`. + + Parameters + ---------- + other + Numeric literal or expression value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator `expr / other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + ''' + def pow(self, exponent: IntoExprColumn | int | float) -> Self: + ''' + Method equivalent of exponentiation operator `expr ** exponent`. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator `expr ^ other`. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... 
) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) + shape: (3, 3) + ┌───────────┬──────────────────┬──────────┐ + │ sets ┆ optional_members ┆ contains │ + │ --- ┆ --- ┆ --- │ + │ list[i64] ┆ i64 ┆ bool │ + ╞═══════════╪══════════════════╪══════════╡ + │ [1, 2, 3] ┆ 1 ┆ true │ + │ [1, 2] ┆ 2 ┆ true │ + │ [9, 10] ┆ 3 ┆ false │ + └───────────┴──────────────────┴──────────┘ + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given lower and upper bounds. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... 
) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with `lit` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. 
+ + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ 5 ┆ 10.0 │ + │ 6 ┆ 12.0 │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. 
If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("index").rolling_min( + ... window_size="2h", by="date", closed="left" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 18 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └───────┴─────────────────────┴─────────────────┘ + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("index").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 19 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("index").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 4 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 20 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └───────┴─────────────────────┴─────────────────┘ + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. 
+ + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("index").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬──────────────────┐ + │ index ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 2.5 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 18.5 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └───────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("index").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬──────────────────┐ + │ index ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 3.0 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 19.0 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └───────┴─────────────────────┴──────────────────┘ + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("index").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 5 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 37 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("index").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 9 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 57 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └───────┴─────────────────────┴─────────────────┘ + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("index").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 0.707107 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("index").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 1.0 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └───────┴─────────────────────┴─────────────────┘ + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("index").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 0.5 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("index").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 1.0 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └───────┴─────────────────────┴─────────────────┘ + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... 
rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. 
+ closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + Experimental. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. 
Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. 
+ + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. 
math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: + ''' + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. 
+ If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. 
+ + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + ''' + def hist(self, bins: IntoExpr | None = ...) -> Self: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + include_breakpoint + Include a column that indicates the upper breakpoint. + include_category + Include a column that shows the intervals as categories. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 3, 8, 8, 2, 1, 3]}) + >>> df.select(pl.col("a").hist(bins=[1, 2, 3])) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 2 │ + │ 2 │ + └─────┘ + >>> df.select( + ... pl.col("a").hist( + ... bins=[1, 2, 3], include_breakpoint=True, include_category=True + ... ) + ... ) + shape: (4, 1) + ┌───────────────────────┐ + │ a │ + │ --- │ + │ struct[3] │ + ╞═══════════════════════╡ + │ {1.0,"(-inf, 1.0]",2} │ + │ {2.0,"(1.0, 2.0]",1} │ + │ {3.0,"(2.0, 3.0]",2} │ + │ {inf,"(3.0, inf]",2} │ + └───────────────────────┘ + ''' + def replace(self, old: IntoExpr | Sequence[Any] | Mapping[Any, Any], new: IntoExpr | Sequence[Any] | NoDefault = ...) -> Self: + ''' + Replace values by different values. + + Parameters + ---------- + old + Value or sequence of values to replace. + Accepts expression input. Sequences are parsed as Series, + other non-expression inputs are parsed as literals. + Also accepts a mapping of values to their replacement as syntactic sugar for + `replace(new=Series(mapping.keys()), old=Series(mapping.values()))`. + new + Value or sequence of values to replace by. + Accepts expression input. Sequences are parsed as Series, + other non-expression inputs are parsed as literals. + Length must match the length of `old` or have length 1. + default + Set values that were not replaced to this value. + Defaults to keeping the original value. + Accepts expression input. Non-expression inputs are parsed as literals. + return_dtype + The data type of the resulting expression. 
If set to `None` (default), + the data type is determined automatically based on the other inputs. + + See Also + -------- + str.replace + + Notes + ----- + The global string cache must be enabled when replacing categorical values. + + Examples + -------- + Replace a single value by another value. Values that were not replaced remain + unchanged. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) + >>> df.with_columns(replaced=pl.col("a").replace(2, 100)) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 3 │ + └─────┴──────────┘ + + Replace multiple values by passing sequences to the `old` and `new` parameters. + + >>> df.with_columns(replaced=pl.col("a").replace([2, 3], [100, 200])) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 200 │ + └─────┴──────────┘ + + Passing a mapping with replacements is also supported as syntactic sugar. + Specify a default to set all values that were not matched. + + >>> mapping = {2: 100, 3: 200} + >>> df.with_columns(replaced=pl.col("a").replace(mapping, default=-1)) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ -1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 200 │ + └─────┴──────────┘ + + Replacing by values of a different data type sets the return type based on + a combination of the `new` data type and either the original data type or the + default data type if it was set. + + >>> df = pl.DataFrame({"a": ["x", "y", "z"]}) + >>> mapping = {"x": 1, "y": 2, "z": 3} + >>> df.with_columns(replaced=pl.col("a").replace(mapping)) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + >>> df.with_columns(replaced=pl.col("a").replace(mapping, default=None)) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + + Set the `return_dtype` parameter to control the resulting data type directly. + + >>> df.with_columns( + ... replaced=pl.col("a").replace(mapping, return_dtype=pl.UInt8) + ... ) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ u8 │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + + Expression input is supported for all parameters. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3], "b": [1.5, 2.5, 5.0, 1.0]}) + >>> df.with_columns( + ... replaced=pl.col("a").replace( + ... old=pl.col("a").max(), + ... new=pl.col("b").sum(), + ... default=pl.col("b"), + ... ) + ... ) + shape: (4, 3) + ┌─────┬─────┬──────────┐ + │ a ┆ b ┆ replaced │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═════╪══════════╡ + │ 1 ┆ 1.5 ┆ 1.5 │ + │ 2 ┆ 2.5 ┆ 2.5 │ + │ 2 ┆ 5.0 ┆ 5.0 │ + │ 3 ┆ 1.0 ┆ 10.0 │ + └─────┴─────┴──────────┘ + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. 
+ agg_list + Aggregate list + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + `polars.Unknown`. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). 
+ """ + def register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by `lib::symbol`. + + The parameters you give dictate how polars will deal + with the function. Make sure they are correct! + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. + returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like `sum`, `min`, `covariance` etc. + cast_to_supertypes + Cast the input datatypes to their supertype. + pass_name_to_apply + if set, then the `Series` passed to the function in the group_by operation + will ensure the name is set. This is an extra heap allocation per group. + changes_length + For example a `unique` or a `slice` + """ + def _register_plugin(self) -> Self: ... + def take_every(self, n: int, offset: int = ...) -> Self: + """ + Take every nth value in the Series and return as a new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + """ + def cumsum(self) -> Self: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumprod(self) -> Self: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummin(self) -> Self: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummax(self) -> Self: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumcount(self) -> Self: + """ + Get an array with the cumulative count computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_count`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in column according to remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. 
+ + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.5/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.5/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..8f9f696 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.5/polars/lazyframe/frame.pyi @@ -0,0 +1,4211 @@ +#: version 0.20.5 +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Duration as Duration, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, String as String, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8 +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.lazyframe.in_process import InProcessQuery as InProcessQuery +from polars.selectors import _expand_selectors as _expand_selectors, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_parameter_as_positional as deprecate_parameter_as_positional, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.various import 
_in_notebook as _in_notebook, _prepare_row_index_args as _prepare_row_index_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use `pl.scan_csv` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use `pl.scan_parquet` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use `pl.scan_ipc` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use `pl.scan_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... + def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) 
-> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to `True`. + If this is set to `True` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... 
).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and pass on + the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the LazyFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If `descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the `k` smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If `descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the `k` largest. Bottom-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. 
+ + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + ''' + def collect(self) -> DataFrame | InProcessQuery: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to `False`. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + This functionality is currently in an alpha state. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + background + Run the query in the background and get a handle to the query. + This handle can be used to fetch the result or cancel the query. + + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + DataFrame directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. 
+ compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). 
+ date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + ''' + def sink_ndjson(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an NDJSON file. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ndjson("out.ndjson") # doctest: +SKIP + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. 
+ comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.String).collect().to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.String}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + ''' + def clear(self, n: int = ...) 
-> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters; use `name = value` to filter columns by the supplied value. + Each constraint will behave the same as `pl.col(name).eq(value)`, and + will be implicitly joined with the other filter conditions using `&`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... 
).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. 
+ + See Also + -------- + select + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to `True` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + ''' + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `dynamic_group_by` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). 
Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ... ) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. 
+ This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. 
+ + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... ).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. 
+ + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... ).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. 
+ + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. 
+ how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\', \'outer_coalesce\'} + Join strategy. + + * *inner* + Returns rows that have matching values in both tables + * *left* + Returns all rows from the left table, and the matched rows from the + right table + * *outer* + Returns all rows when there is a match in either left or right table + * *outer_coalesce* + Same as \'outer\', but coalesces the key columns + * *cross* + Returns the Cartesian product of rows from both tables + * *semi* + Filter rows that have a match in the right table. + * *anti* + Filter rows that do not have a match in the right table. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported by the streaming engine. + - This is only supported when joined by single columns. + join_nulls + Join on null values. By default null values will never produce matches. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 5) + ┌──────┬──────┬──────┬───────┬───────────┐ + │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞══════╪══════╪══════╪═══════╪═══════════╡ + │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │ + │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │ + │ null ┆ null ┆ null ┆ z ┆ d │ + │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │ + └──────┴──────┴──────┴───────┴───────────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this LazyFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this LazyFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... 
pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another DataFrame: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + ''' + def drop(self, *columns: ColumnNameOrSelector | Iterable[ColumnNameOrSelector]) -> Self: + ''' + Remove columns from the DataFrame. + + Parameters + ---------- + *columns + Names of the columns that should be removed from the dataframe. + Accepts column selector input. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + ''' + def rename(self, mapping: dict[str, str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name. + + Notes + ----- + If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), + polars will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. 
+ + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... 
{ + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + ''' + def with_row_index(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a row index as the first column in the LazyFrame. + + Parameters + ---------- + name + Name of the index column. + offset + Start the index at this offset. Cannot be negative. + + Warnings + -------- + Using this function can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Notes + ----- + The resulting column does not have any special properties. It is a regular + column of type `UInt32` (or `UInt64` in `polars-u64-idx`). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_index().collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + >>> lf.with_row_index("id", offset=1000).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ id ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1000 ┆ 1 ┆ 2 │ + │ 1001 ┆ 3 ┆ 4 │ + │ 1002 ┆ 5 ┆ 6 │ + └──────┴─────┴─────┘ + + An index column can also be created using the expressions :func:`int_range` + and :func:`len`. + + >>> lf.select( + ... pl.int_range(pl.len(), dtype=pl.UInt32).alias("index"), + ... pl.all(), + ... ).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + .. deprecated:: + Use :meth:`with_row_index` instead. + Note that the default column name has changed from \'row_nr\' to \'index\'. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. 
+ + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() # doctest: +SKIP + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + ''' + def gather_every(self, n: int, offset: int = ...) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.gather_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + >>> lf.gather_every(2, offset=1).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill `value` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + ''' + def std(self, ddof: int = ...) 
-> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. 
+ + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the DataFrame to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or String datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this DataFrame. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... 
) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. + + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. 
+ + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as value variables; if `value_vars` + is empty, all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function. If set to `None`, we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to run with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is executed on the full + dataset. + + Warnings + -------- + The `schema` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, `predicate_pushdown` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... 
) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the DataFrame at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... 
).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + .. warning:: + This functionality is experimental and may change without it being + considered a breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. If set to `None` (default), + the implicit row index of each frame is used as a join key. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + include_nulls + If True, null values from the right DataFrame will be used to update the + left DataFrame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... 
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + ''' + def count(self) -> Self: + ''' + Return the number of non-null elements for each column. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... {"a": [1, 2, 3, 4], "b": [1, 2, 1, None], "c": [None, None, None, None]} + ... ) + >>> lf.count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 3 ┆ 0 │ + └─────┴─────┴─────┘ + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + """ + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + n + Number of places to shift (may be negative). + """ + def take_every(self, n: int, offset: int = ...) -> Self: + """ + Take every nth row in the LazyFrame and return as a new LazyFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
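The tail of the generated 0.20.5 LazyFrame stub above is mostly deprecation shims whose docstrings point at their renamed replacements: `groupby` -> `group_by`, `groupby_rolling`/`group_by_rolling` -> `rolling`, `groupby_dynamic` -> `group_by_dynamic`, `map` -> `map_batches`, `shift_and_fill` -> `shift`, and `take_every` -> `gather_every`. As a rough orientation only, the sketch below exercises a couple of the current spellings; it is not part of the generated stub, and the frame contents and column names are invented for illustration.

import polars as pl

lf = pl.LazyFrame({"g": ["a", "a", "b"], "x": [1, 2, 3]})

# Current spellings of the deprecated methods stubbed above:
#   lf.groupby("g")   -> lf.group_by("g")
#   lf.map(fn)        -> lf.map_batches(fn)
#   lf.take_every(2)  -> lf.gather_every(2)
out = lf.group_by("g").agg(pl.col("x").sum()).collect()
print(out)

Because the shims are kept in the stub rather than stripped, both the old and the new spellings stay visible to a type checker for this polars version.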
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.5/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.5/polars/series/series.pyi new file mode 100644 index 0000000..c37f2c6 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.5/polars/series/series.pyi @@ -0,0 +1,5172 @@ +#: version 0.20.5 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Enum as Enum, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Null as Null, Object as Object, String as String, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, hvplot as hvplot +from polars.exceptions import ModuleUpgradeRequired as ModuleUpgradeRequired, ShapeError as ShapeError +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.various import _is_generator as _is_generator, no_default as no_default, parse_percentiles as parse_percentiles, parse_version as parse_version, range_to_series as range_to_series, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor, warn_null_comparison as warn_null_comparison +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, Collection, Generator, Iterable, 
Literal, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +_HVPLOT_AVAILABLE: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_buffer_info(self) -> BufferInfo: + """ + Return pointer, offset, and length information about the underlying buffer. + + Returns + ------- + tuple of ints + Tuple of the form (pointer, offset, length) + + Raises + ------ + ComputeError + If the `Series` contains multiple chunks. + """ + def _get_buffer(self, index: Literal[0, 1, 2]) -> Self | None: + """ + Return the underlying data, validity, or offsets buffer as a Series. + + The data buffer always exists. + The validity buffer may not exist if the column contains no null values. + The offsets buffer only exists for Series of data type `String` and `List`. + + Parameters + ---------- + index + An index indicating the buffer to return: + + - `0` -> data buffer + - `1` -> validity buffer + - `2` -> offsets buffer + + Returns + ------- + Series or None + `Series` if the specified buffer exists, `None` otherwise. + + Raises + ------ + ComputeError + If the `Series` contains multiple chunks. + """ + def _from_buffer(self, dtype: PolarsDataType, buffer_info: BufferInfo, owner: Any) -> Self: + """ + Construct a Series from information about its underlying buffer. + + Parameters + ---------- + dtype + The data type of the buffer. + buffer_info + Tuple containing buffer information in the form `(pointer, offset, length)`. + owner + The object owning the buffer. + + Returns + ------- + Series + """ + def _from_buffers(self, dtype: PolarsDataType, data: Series | Sequence[Series], validity: Series | None = ...) -> Self: + """ + Construct a Series from information about its underlying buffers. + + Parameters + ---------- + dtype + The data type of the resulting Series. + data + Buffers describing the data. For most data types, this is a single Series of + the physical data type of `dtype`. Some data types require multiple buffers: + + - `String`: A data buffer of type `UInt8` and an offsets buffer + of type `Int64`. + validity + Validity buffer. If specified, must be a Series of data type `Boolean`. + + Returns + ------- + Series + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... 
+ def le(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series <= other`.""" + def lt(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series < other`.""" + def eq(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series == other`.""" + def eq_missing(self, other: Any) -> Series | Expr: + ''' + Method equivalent of equality operator `series == other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + ''' + def ne(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series != other`.""" + def ne_missing(self, other: Any) -> Series | Expr: + ''' + Method equivalent of equality operator `series != other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + ''' + def ge(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series >= other`.""" + def gt(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series > other`.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... + def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... 
+ def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the Series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to `s[0]`, with a check + that the shape is (1,). With an index, this is equivalent to `s[index]`. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cum_sum().item(-1) + 24 + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. + + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.sqrt() + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.414214 + 1.732051 + ] + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.cbrt() + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.259921 + 1.44225 + ] + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. 
_Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + """ + def log(self, base: float = ...) -> Series: + """ + Compute the logarithm to a given base. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.log() + shape: (3,) + Series: '' [f64] + [ + 0.0 + 0.693147 + 1.098612 + ] + """ + def log1p(self) -> Series: + """ + Compute the natural logarithm of the input array plus one, element-wise. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.log1p() + shape: (3,) + Series: '' [f64] + [ + 0.693147 + 1.098612 + 1.386294 + ] + """ + def log10(self) -> Series: + """ + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> s = pl.Series([10, 100, 1000]) + >>> s.log10() + shape: (3,) + Series: '' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + """ + def exp(self) -> Series: + """ + Compute the exponential, element-wise. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.exp() + shape: (3,) + Series: '' [f64] + [ + 2.718282 + 7.389056 + 20.085537 + ] + """ + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. 
+ + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Quick summary statistics of a Series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + Series has a numeric dtype). All values must be in the range `[0, 1]`. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> s = pl.Series([1, 2, 3, 4, 5]) + >>> s.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + Non-numeric data types may not have all statistics available. + + >>> s = pl.Series(["a", "a", None, "b", "c"]) + >>> s.describe() + shape: (3, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════════╪═══════╡ + │ count ┆ 4 │ + │ null_count ┆ 1 │ + │ unique ┆ 4 │ + └────────────┴───────┘ + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + ''' + def mean(self) -> PythonLiteral | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + ''' + def product(self) -> int | float: + ''' + Reduce this Series to the product value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.product() + 6 + ''' + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. + + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4]) + >>> s.nan_max() + 4 + + >>> s = pl.Series("a", [1, float("nan"), 4]) + >>> s.nan_max() + nan + ''' + def nan_min(self) -> int | float | date | datetime | timedelta | str: + ''' + Get minimum value, but propagate/poison encountered NaN values. 
+ + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4]) + >>> s.nan_min() + 1 + + >>> s = pl.Series("a", [1, float("nan"), 4]) + >>> s.nan_min() + nan + ''' + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + ''' + def median(self) -> PythonLiteral | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. 
+ + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. 
+ + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + ''' + def rle(self) -> Series: + ''' + Get the lengths and values of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + See Also + -------- + rle_id + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Get a distinct integer ID for each run of identical values. + + The ID increases by one each time the value of a column (which can be a + :class:`Struct`) changes. + + This is especially useful when you want to define a new group for every time a + column\'s value changes, rather than for every distinct value of that column. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + include_breakpoint + Include a column that indicates the upper breakpoint. + include_category + Include a column that shows the intervals as categories. + + Returns + ------- + DataFrame + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬───────┐ + │ break_point ┆ category ┆ count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═══════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴───────┘ + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. 
+ + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬───────┐ + │ color ┆ count │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴───────┘ + + Sort the output by count. + + >>> s.value_counts(sort=True) + shape: (3, 2) + ┌───────┬───────┐ + │ color ┆ count │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴───────┘ + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This functionality is experimental and may change without it being considered a + breaking change. + + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + ''' + def cum_max(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cum_max() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + ''' + def cum_min(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cum_min() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + ''' + def cum_prod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_prod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + ''' + def cum_sum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_sum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + ''' + def cum_count(self) -> Self: + ''' + Return the cumulative count of the non-null values in the column. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> s = pl.Series(["x", "k", None, "d"]) + >>> s.cum_count() + shape: (4,) + Series: \'\' [u32] + [ + 1 + 2 + 2 + 3 + ] + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. + + The resulting series will consist of multiple chunks. + + Parameters + ---------- + other + Series to append. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. + + >>> a.n_chunks() + 2 + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from `append`, which adds the chunks from `other` to the chunks of + this series, `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation (which is expensive). 
+ + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer `append` over `extend` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single `Series`. In the latter case, finish the sequence + of `append` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + head + """ + def gather_every(self, n: int, offset: int = ...) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Start the row index at this offset. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + >>> s.gather_every(2, offset=1) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + in_place + Sort in-place. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. + """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + ''' + def null_count(self) -> int: + """ + Count the null values in this Series. 
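+
+ Notes
+ -----
+ An illustrative note from the editor, not part of the upstream Polars
+ docstring: because `len` counts every element (nulls included) and `count`
+ counts only the non-null elements, the null count should always equal
+ `len() - count()`.
+
+ >>> s = pl.Series([1, None, None])
+ >>> s.len() - s.count()
+ 2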
+
+ Examples
+ --------
+ >>> s = pl.Series([1, None, None])
+ >>> s.null_count()
+ 2
+ """
+ def has_validity(self) -> bool:
+ """
+ Return True if the Series has a validity bitmask.
+
+ If there is no mask, it means that there are no `null` values.
+
+ Notes
+ -----
+ While the *absence* of a validity bitmask guarantees that a Series does not
+ have `null` values, the converse is not true, e.g. the *presence* of a
+ bitmask does not mean that there are null values, as every value of the
+ bitmask could be `false`.
+
+ To confirm that a column has `null` values use :func:`null_count`.
+ """
+ def is_empty(self) -> bool:
+ \'\'\'
+ Check if the Series is empty.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [], dtype=pl.Float32)
+ >>> s.is_empty()
+ True
+ \'\'\'
+ def is_sorted(self) -> bool:
+ """
+ Check if the Series is sorted.
+
+ Parameters
+ ----------
+ descending
+ Check if the Series is sorted in descending order
+
+ Examples
+ --------
+ >>> s = pl.Series([1, 3, 2])
+ >>> s.is_sorted()
+ False
+
+ >>> s = pl.Series([3, 2, 1])
+ >>> s.is_sorted(descending=True)
+ True
+ """
+ def not_(self) -> Series:
+ \'\'\'
+ Negate a boolean Series.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [True, False, False])
+ >>> s.not_()
+ shape: (3,)
+ Series: \'a\' [bool]
+ [
+ false
+ true
+ true
+ ]
+ \'\'\'
+ def is_null(self) -> Series:
+ \'\'\'
+ Returns a boolean Series indicating which values are null.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1.0, 2.0, 3.0, None])
+ >>> s.is_null()
+ shape: (4,)
+ Series: \'a\' [bool]
+ [
+ false
+ false
+ false
+ true
+ ]
+ \'\'\'
+ def is_not_null(self) -> Series:
+ \'\'\'
+ Returns a boolean Series indicating which values are not null.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> s = pl.Series("a", [1.0, 2.0, 3.0, None])
+ >>> s.is_not_null()
+ shape: (4,)
+ Series: \'a\' [bool]
+ [
+ true
+ true
+ true
+ false
+ ]
+ \'\'\'
+ def is_finite(self) -> Series:
+ \'\'\'
+ Returns a boolean Series indicating which values are finite.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> s = pl.Series("a", [1.0, 2.0, np.inf])
+ >>> s.is_finite()
+ shape: (3,)
+ Series: \'a\' [bool]
+ [
+ true
+ true
+ false
+ ]
+ \'\'\'
+ def is_infinite(self) -> Series:
+ \'\'\'
+ Returns a boolean Series indicating which values are infinite.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> s = pl.Series("a", [1.0, 2.0, np.inf])
+ >>> s.is_infinite()
+ shape: (3,)
+ Series: \'a\' [bool]
+ [
+ false
+ false
+ true
+ ]
+ \'\'\'
+ def is_nan(self) -> Series:
+ \'\'\'
+ Returns a boolean Series indicating which values are NaN.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan])
+ >>> s.is_nan()
+ shape: (4,)
+ Series: \'a\' [bool]
+ [
+ false
+ false
+ false
+ true
+ ]
+ \'\'\'
+ def is_not_nan(self) -> Series:
+ \'\'\'
+ Returns a boolean Series indicating which values are not NaN.
+
+ Returns
+ -------
+ Series
+ Series of data type :class:`Boolean`.
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + ''' + def explode(self) -> Series: + """ + Explode a list Series. + + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + """ + def equals(self, other: Series) -> bool: + ''' + Check whether the Series is equal to another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. 
+ + See Also + -------- + assert_series_equal + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s1.equals(s1) + True + >>> s1.equals(s2) + False + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that are between the given lower/upper bounds. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. 
+ + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point `nan` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set `zero_copy_only=True`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + ''' + def _view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + + Returns + ------- + SeriesView + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s._view(ignore_nulls=True) + SeriesView([1, 0]) + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + ''' + def to_pandas(self, *args: Any, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This requires that :mod:`pandas` and :mod:`pyarrow` are installed. + This operation clones data, unless `use_pyarrow_extension_array=True`. + + Parameters + ---------- + use_pyarrow_extension_array + Further operations on this Pandas series, might trigger conversion to numpy. + Use PyArrow backed-extension array instead of numpy array for pandas + Series. This allows zero copy operations and preservation of nulls + values. + Further operations on this pandas Series, might trigger conversion + to NumPy arrays if that operation is not supported by pyarrow compute + functions. + kwargs + Arguments will be sent to :meth:`pyarrow.Table.to_pandas`. + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s1.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + >>> s1.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64[pyarrow] + >>> s2 = pl.Series("b", [1, 2, None, 4]) + >>> s2.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + 3 4.0 + Name: b, dtype: float64 + >>> s2.to_pandas(use_pyarrow_extension_array=True) # doctest: +SKIP + 0 1 + 1 2 + 2 + 3 4 + Name: b, dtype: int64[pyarrow] + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. 
+ + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + ''' + def count(self) -> int: + ''' + Return the number of non-null elements in the column. + + See Also + -------- + len + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.count() + 2 + ''' + def len(self) -> int: + ''' + Return the number of elements in the Series. + + Null values count towards the total. + + See Also + -------- + count + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + ''' + def set(self, filter: Series, value: int | float | str | bool | None) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + ''' + def scatter(self, indices: Series | Iterable[int] | int | np.ndarray[Any, Any], values: Series | Iterable[PythonLiteral] | PythonLiteral | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimization (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.scatter(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_index().select( + ... pl.when(pl.col("index") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + ''' + def round_sig_figs(self, digits: int) -> Series: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s.round_sig_figs(2) + shape: (3,) + Series: '' [f64] + [ + 0.012 + 3.3 + 1200.0 + ] + """ + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). 
+ + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + ''' + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. + + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify `fill_value` to fill the resulting null values. + + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. 
+ + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. 
+ weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their std dev. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their variance. 
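+
+ An illustrative aside from the editor, not part of the upstream Polars
+ docstring: for the same window settings, the result should equal the square
+ of :func:`Series.rolling_std`.
+
+ >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0])
+ >>> s.rolling_std(window_size=3) ** 2  # doctest: +SKIP
+ shape: (6,)
+ Series: \'a\' [f64]
+ [
+ null
+ null
+ 1.0
+ 1.0
+ 2.333333
+ 4.0
+ ]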
+ + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window. + + Warnings + -------- + + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. 
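+
+ An illustrative relationship noted by the editor, not part of the upstream
+ Polars docstring: with `interpolation="linear"`, a rolling quantile of 0.5
+ should match :func:`Series.rolling_median` for the same window.
+
+ >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0])
+ >>> s.rolling_quantile(quantile=0.5, interpolation="linear", window_size=3)  # doctest: +SKIP
+ shape: (6,)
+ Series: \'a\' [f64]
+ [
+ null
+ null
+ 2.0
+ 3.0
+ 4.0
+ 6.0
+ ]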
+ + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. 
Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + + Examples + -------- + >>> s = pl.Series([1, -2, -3]) + >>> s.abs() + shape: (3,) + Series: '' [i64] + [ + 1 + 2 + 3 + ] + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. 
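+
+ Notes
+ -----
+ An illustrative note from the editor, not part of the upstream Polars
+ docstring: with the default `null_behavior="ignore"`, the result matches
+ subtracting a copy of the Series shifted by `n`.
+
+ >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8)
+ >>> (s - s.shift(1)).to_list()  # doctest: +SKIP
+ [None, -10, 20, -5, 10]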
+ + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> s = pl.Series([1, 2, 2, 4, 5]) + >>> s.skew() + 0.34776706224699483 + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. 
+ If set to `None` (default), no lower bound is applied.
+ upper_bound
+ Upper bound. Accepts expression input.
+ Non-expression inputs are parsed as literals.
+ If set to `None` (default), no upper bound is applied.
+
+ See Also
+ --------
+ when
+
+ Notes
+ -----
+ This method only works for numeric and temporal columns. To clip other data
+ types, consider writing a `when-then-otherwise` expression. See :func:`when`.
+
+ Examples
+ --------
+ Specifying both a lower and upper bound:
+
+ >>> s = pl.Series([-50, 5, 50, None])
+ >>> s.clip(1, 10)
+ shape: (4,)
+ Series: '' [i64]
+ [
+ 1
+ 5
+ 10
+ null
+ ]
+
+ Specifying only a single bound:
+
+ >>> s.clip(upper_bound=10)
+ shape: (4,)
+ Series: '' [i64]
+ [
+ -50
+ 5
+ 10
+ null
+ ]
+ """
+ def lower_bound(self) -> Self:
+ \'\'\'
+ Return the lower bound of this Series\' dtype as a unit Series.
+
+ See Also
+ --------
+ upper_bound : return the upper bound of the given Series\' dtype.
+
+ Examples
+ --------
+ >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32)
+ >>> s.lower_bound()
+ shape: (1,)
+ Series: \'s\' [i32]
+ [
+ -2147483648
+ ]
+
+ >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32)
+ >>> s.lower_bound()
+ shape: (1,)
+ Series: \'s\' [f32]
+ [
+ -inf
+ ]
+ \'\'\'
+ def upper_bound(self) -> Self:
+ \'\'\'
+ Return the upper bound of this Series\' dtype as a unit Series.
+
+ See Also
+ --------
+ lower_bound : return the lower bound of the given Series\' dtype.
+
+ Examples
+ --------
+ >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8)
+ >>> s.upper_bound()
+ shape: (1,)
+ Series: \'s\' [i8]
+ [
+ 127
+ ]
+
+ >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64)
+ >>> s.upper_bound()
+ shape: (1,)
+ Series: \'s\' [f64]
+ [
+ inf
+ ]
+ \'\'\'
+ def replace(self, old: IntoExpr | Sequence[Any] | Mapping[Any, Any], new: IntoExpr | Sequence[Any] | NoDefault = ...) -> Self:
+ \'\'\'
+ Replace values by different values.
+
+ Parameters
+ ----------
+ old
+ Value or sequence of values to replace.
+ Also accepts a mapping of values to their replacement as syntactic sugar for
+ `replace(old=Series(mapping.keys()), new=Series(mapping.values()))`.
+ new
+ Value or sequence of values to replace by.
+ Length must match the length of `old` or have length 1.
+ default
+ Set values that were not replaced to this value.
+ Defaults to keeping the original value.
+ Accepts expression input. Non-expression inputs are parsed as literals.
+ return_dtype
+ The data type of the resulting Series. If set to `None` (default),
+ the data type is determined automatically based on the other inputs.
+
+ See Also
+ --------
+ str.replace
+
+ Notes
+ -----
+ The global string cache must be enabled when replacing categorical values.
+
+ Examples
+ --------
+ Replace a single value by another value. Values that were not replaced remain
+ unchanged.
+
+ >>> s = pl.Series([1, 2, 2, 3])
+ >>> s.replace(2, 100)
+ shape: (4,)
+ Series: \'\' [i64]
+ [
+ 1
+ 100
+ 100
+ 3
+ ]
+
+ Replace multiple values by passing sequences to the `old` and `new` parameters.
+
+ >>> s.replace([2, 3], [100, 200])
+ shape: (4,)
+ Series: \'\' [i64]
+ [
+ 1
+ 100
+ 100
+ 200
+ ]
+
+ Passing a mapping with replacements is also supported as syntactic sugar.
+ Specify a default to set all values that were not matched.
+
+ >>> mapping = {2: 100, 3: 200}
+ >>> s.replace(mapping, default=-1)
+ shape: (4,)
+ Series: \'\' [i64]
+ [
+ -1
+ 100
+ 100
+ 200
+ ]
+
+ The default can be another Series.
+ + >>> default = pl.Series([2.5, 5.0, 7.5, 10.0]) + >>> s.replace(2, 100, default=default) + shape: (4,) + Series: \'\' [f64] + [ + 2.5 + 100.0 + 100.0 + 10.0 + ] + + Replacing by values of a different data type sets the return type based on + a combination of the `new` data type and either the original data type or the + default data type if it was set. + + >>> s = pl.Series(["x", "y", "z"]) + >>> mapping = {"x": 1, "y": 2, "z": 3} + >>> s.replace(mapping) + shape: (3,) + Series: \'\' [str] + [ + "1" + "2" + "3" + ] + >>> s.replace(mapping, default=None) + shape: (3,) + Series: \'\' [i64] + [ + 1 + 2 + 3 + ] + + Set the `return_dtype` parameter to control the resulting data type directly. + + >>> s.replace(mapping, return_dtype=pl.UInt8) + shape: (3,) + Series: \'\' [u8] + [ + 1 + 2 + 3 + ] + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Series: + ''' + Reshape this Series to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Series + If a single dimension is given, results in a Series of the original + data type. + If a multiple dimensions are given, results in a Series of data type + :class:`List` with shape (rows, cols). + + See Also + -------- + Series.list.explode : Explode a list column. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> s.reshape((3, 3)) + shape: (3,) + Series: \'foo\' [list[i64]] + [ + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + ] + ''' + def shuffle(self, seed: int | None = ...) -> Series: + ''' + Shuffle the contents of this Series. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.shuffle(seed=1) + shape: (3,) + Series: \'a\' [i64] + [ + 2 + 1 + 3 + ] + ''' + def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + """ + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. 
+ For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.ewm_mean(com=1) + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.666667 + 2.428571 + ] + """ + def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. 
math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_var(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.5 + 0.928571 + ] + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Series: + """ + Extremely fast method for extending the Series with 'n' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend + the Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.extend_constant(99, n=2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 99 + 99 + ] + """ + def set_sorted(self) -> Self: + ''' + Flags the Series as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + If the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set_sorted().max() + 3 + ''' + def new_from_index(self, index: int, length: int) -> Self: + """Create a new Series filled with values from the given index.""" + def shrink_dtype(self) -> Series: + """ + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + """ + def get_chunks(self) -> list[Series]: + """Get the chunks of this Series as a list of Series.""" + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.implode() + shape: (1,) + Series: \'a\' [list[i64]] + [ + [1, 2, 3] + ] + ''' + def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) 
-> Self: + """ + Apply a custom/user-defined function (UDF) over elements in this Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.map_elements`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Series.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + """ + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() # doctest: +SKIP + True + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_integer()` instead. + For signed/unsigned variants, use `Series.dtype.is_signed_integer()` + or `Series.dtype.is_unsigned_integer()`. 
+ + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() # doctest: +SKIP + True + >>> s.is_integer(signed=False) # doctest: +SKIP + True + >>> s.is_integer(signed=True) # doctest: +SKIP + False + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_numeric()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() # doctest: +SKIP + True + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_temporal()` instead. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. + + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() # doctest: +SKIP + True + >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP + False + """ + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Boolean` instead. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() # doctest: +SKIP + True + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a String. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.String` instead. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() # doctest: +SKIP + True + ''' + def take_every(self, n: int, offset: int = ...) -> Series: + """ + Take every nth value in the Series and return as new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + Index location used for selection. + """ + def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + """ + Set values at the index locations. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`scatter`. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + """ + def cumsum(self) -> Series: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummax(self) -> Series: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummin(self) -> Series: + """ + Get an array with the cumulative min computed at every element. + + .. 
deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cumprod(self) -> Series: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def view(self) -> SeriesView: + """ + Get a view into this Series data with a numpy array. + + .. deprecated:: 0.19.14 + This method will be removed in a future version. + + This operation doesn't clone data, but does not include missing values. + Don't use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in the Series using a remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + def series_equal(self, other: Series) -> bool: + """ + Check whether the Series is equal to another Series. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`equals`. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... + @property + def plot(self): ... 
+def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.6/polars/dataframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.6/polars/dataframe/frame.pyi new file mode 100644 index 0000000..f62e6be --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.6/polars/dataframe/frame.pyi @@ -0,0 +1,7139 @@ +#: version 0.20.6 +import P +import deltalake +import deltalake.table +import np as np +import pa as pa +import pd as pd +from _io import BytesIO, TextIOWrapper + +from builtins import PyDataFrame +from pathlib import Path +from polars.dataframe._html import NotebookFormatter as NotebookFormatter +from polars.dataframe.group_by import DynamicGroupBy as DynamicGroupBy, GroupBy as GroupBy, RollingGroupBy as RollingGroupBy +from polars.datatypes.classes import Boolean as Boolean, Float64 as Float64, Object as Object, String as String +from polars.datatypes.convert import py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, hvplot as hvplot, import_optional as import_optional +from polars.exceptions import ModuleUpgradeRequired as ModuleUpgradeRequired, NoRowsReturnedError as NoRowsReturnedError, TooManyRowsReturnedError as TooManyRowsReturnedError +from polars.functions.col import col as col +from polars.functions.lit import lit as lit +from polars.io._utils import _is_glob_pattern as _is_glob_pattern, _is_local_file as _is_local_file +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.spreadsheet._write_utils import _XLFormatCache as _XLFormatCache, _unpack_multi_column_dict as _unpack_multi_column_dict, _xl_apply_conditional_formats as _xl_apply_conditional_formats, _xl_inject_sparklines as _xl_inject_sparklines, _xl_setup_table_columns as _xl_setup_table_columns, _xl_setup_table_options as _xl_setup_table_options, _xl_setup_workbook as _xl_setup_workbook, _xl_unique_table_name as _xl_unique_table_name +from polars.selectors import _expand_selector_dicts as _expand_selector_dicts, _expand_selectors as _expand_selectors +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import _post_apply_columns as _post_apply_columns, arrow_to_pydf as arrow_to_pydf, dict_to_pydf as dict_to_pydf, frame_to_pydf as frame_to_pydf, iterable_to_pydf as iterable_to_pydf, numpy_to_idxs as numpy_to_idxs, numpy_to_pydf as numpy_to_pydf, pandas_to_pydf as pandas_to_pydf, sequence_to_pydf as sequence_to_pydf, series_to_pydf as series_to_pydf +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression +from polars.utils._wrap import wrap_expr as wrap_expr, wrap_ldf as wrap_ldf, wrap_s as wrap_s +from polars.utils.convert import _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_parameter_as_positional as deprecate_parameter_as_positional, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as 
issue_deprecation_warning +from polars.utils.unstable import issue_unstable_warning as issue_unstable_warning, unstable as unstable +from polars.utils.various import _prepare_row_index_args as _prepare_row_index_args, _process_null_values as _process_null_values, handle_projection_columns as handle_projection_columns, is_bool_sequence as is_bool_sequence, is_int_sequence as is_int_sequence, is_str_sequence as is_str_sequence, normalize_filepath as normalize_filepath, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes, warn_null_comparison as warn_null_comparison +from typing import Any, BinaryIO, Callable, ClassVar as _ClassVar, Collection, IO, Iterable, Iterator, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +INTEGER_DTYPES: frozenset +N_INFER_DEFAULT: int +_HVPLOT_AVAILABLE: bool +_PANDAS_AVAILABLE: bool +_PYARROW_AVAILABLE: bool +_dtype_str_repr: builtin_function_or_method + +class DataFrame: + _accessors: _ClassVar[set] = ... + columns: list[str] + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pydf(cls, py_df: PyDataFrame) -> Self: + """Construct Polars DataFrame from FFI PyDataFrame object.""" + @classmethod + def _from_dicts(cls, data: Sequence[dict[str, Any]], schema: SchemaDefinition | None = ...) -> Self: ... + @classmethod + def _from_dict(cls, data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a dictionary of sequences. + + Parameters + ---------- + data : dict of sequences + Two-dimensional data represented as a dictionary. dict must contain + Sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + """ + @classmethod + def _from_records(cls, data: Sequence[Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a sequence of sequences. + + Parameters + ---------- + data : Sequence of sequences + Two-dimensional data represented as a sequence of sequences. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
+ schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + infer_schema_length + How many rows to scan to determine the column type. + """ + @classmethod + def _from_numpy(cls, data: np.ndarray[Any, Any], schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from a numpy ndarray. + + Parameters + ---------- + data : numpy ndarray + Two-dimensional data represented as a numpy ndarray. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + orient : {'col', 'row'}, default None + Whether to interpret two-dimensional data as columns or as rows. If None, + the orientation is inferred by matching the columns and data dimensions. If + this does not yield conclusive results, column orientation is used. + """ + @classmethod + def _from_arrow(cls, data: pa.Table, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a DataFrame from an Arrow table. + + This operation will be zero copy for the most part. Types that are not + supported by Polars may be cast to the closest supported type. + + Parameters + ---------- + data : arrow table, array, or sequence of sequences + Data representing an Arrow Table or Array. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + """ + @classmethod + def _from_pandas(cls, data: pd.DataFrame, schema: SchemaDefinition | None = ...) -> Self: + """ + Construct a Polars DataFrame from a pandas DataFrame. + + Parameters + ---------- + data : pandas DataFrame + Two-dimensional data represented as a pandas DataFrame. 
+ schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. + schema_overrides : dict, default None + Support type specification or override of one or more columns; note that + any dtypes inferred from the columns param will be overridden. + rechunk : bool, default True + Make sure that all data is in contiguous memory. + nan_to_null : bool, default True + If the data contains NaN values they will be converted to null/None. + include_index : bool, default False + Load any non-default pandas indexes as columns. + """ + @classmethod + def _read_csv(cls, source: str | Path | IO[bytes] | bytes) -> DataFrame: + """ + Read a CSV file into a DataFrame. + + Use `pl.read_csv` to dispatch to this method. + + See Also + -------- + polars.io.read_csv + """ + @classmethod + def _read_parquet(cls, source: str | Path | IO[bytes] | bytes) -> DataFrame: + """ + Read into a DataFrame from a parquet file. + + Use `pl.read_parquet` to dispatch to this method. + + See Also + -------- + polars.io.read_parquet + """ + @classmethod + def _read_avro(cls, source: str | Path | BinaryIO | bytes) -> Self: + """ + Read into a DataFrame from Apache Avro format. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns. + n_rows + Stop reading from Apache Avro file after reading `n_rows`. + """ + @classmethod + def _read_ipc(cls, source: str | Path | IO[bytes] | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC file format. + + See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html. + Arrow IPC files are also known as Feather (v2) files. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC file after reading `n_rows`. + row_index_name + Row index name. + row_index_offset + Row index offset. + rechunk + Make sure that all data is contiguous. + memory_map + Memory map the file + ''' + @classmethod + def _read_ipc_stream(cls, source: str | Path | IO[bytes] | bytes) -> Self: + ''' + Read into a DataFrame from Arrow IPC record batch stream format. + + See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + columns + Columns to select. Accepts a list of column indices (starting at zero) or a + list of column names. + n_rows + Stop reading from IPC stream after reading `n_rows`. + row_index_name + Row index name. 
+ row_index_offset + Row index offset. + rechunk + Make sure that all data is contiguous. + ''' + @classmethod + def _read_json(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a JSON file. + + Use `pl.read_json` to dispatch to this method. + + See Also + -------- + polars.io.read_json + """ + @classmethod + def _read_ndjson(cls, source: str | Path | IOBase | bytes) -> Self: + """ + Read into a DataFrame from a newline delimited JSON file. + + Use `pl.read_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.read_ndjson + """ + def _replace(self, column: str, new_column: Series) -> Self: + """Replace a column by a new Series (in place).""" + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.DataFrame(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __dataframe__(self, nan_as_null: bool = ..., allow_copy: bool = ...) -> PolarsDataFrame: + ''' + Convert to a dataframe object implementing the dataframe interchange protocol. + + Parameters + ---------- + nan_as_null + Overwrite null values in the data with `NaN`. + + .. warning:: + This functionality has not been implemented and the parameter will be + removed in a future version. + Setting this to `True` will raise a `NotImplementedError`. + allow_copy + Allow memory to be copied to perform the conversion. If set to `False`, + causes conversions that are not zero-copy to fail. + + Notes + ----- + Details on the Python dataframe interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + Convert a Polars DataFrame to a generic dataframe object and access some + properties. + + >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]}) + >>> dfi = df.__dataframe__() + >>> dfi.num_rows() + 2 + >>> dfi.get_column(1).dtype + (, 64, \'g\', \'=\') + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another object.""" + def _compare_to_other_df(self, other: DataFrame, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with another DataFrame.""" + def _compare_to_non_df(self, other: Any, op: ComparisonOperator) -> DataFrame: + """Compare a DataFrame with a non-DataFrame object.""" + def _div(self, other: Any) -> DataFrame: ... + def _cast_all_from_to(self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType) -> DataFrame: ... + def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame: ... + def __bool__(self) -> NoReturn: ... + def __eq__(self, other: Any) -> DataFrame: ... + def __ne__(self, other: Any) -> DataFrame: ... + def __gt__(self, other: Any) -> DataFrame: ... + def __lt__(self, other: Any) -> DataFrame: ... + def __ge__(self, other: Any) -> DataFrame: ... + def __le__(self, other: Any) -> DataFrame: ... + def __mul__(self, other: DataFrame | Series | int | float) -> Self: ... + def __rmul__(self, other: DataFrame | Series | int | float) -> Self: ... 
+ def __add__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __radd__(self, other: DataFrame | Series | int | float | bool | str) -> DataFrame: ... + def __sub__(self, other: DataFrame | Series | int | float) -> Self: ... + def __mod__(self, other: DataFrame | Series | int | float) -> Self: ... + def __contains__(self, key: str) -> bool: ... + def __iter__(self) -> Iterator[Series]: ... + def __reversed__(self) -> Iterator[Series]: ... + def _pos_idx(self, idx: int, dim: int) -> int: ... + def _take_with_series(self, s: Series) -> DataFrame: ... + def __getitem__(self, item: str | int | np.ndarray[Any, Any] | MultiColSelector | tuple[int, MultiColSelector] | tuple[MultiRowSelector, MultiColSelector] | tuple[MultiRowSelector, int | str] | tuple[int, int | str]) -> DataFrame | Series: + """Get item. Does quite a lot. Read the comments.""" + def __setitem__(self, key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int], value: Any) -> None: ... + def __len__(self) -> int: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def _ipython_key_completions_(self) -> list[str]: ... + def _repr_html_(self, **kwargs: Any) -> str: + """ + Format output data in HTML for display in Jupyter Notebooks. + + Output rows and columns can be modified by setting the following ENVIRONMENT + variables: + + * POLARS_FMT_MAX_COLS: set the number of columns + * POLARS_FMT_MAX_ROWS: set the number of rows + """ + def item(self, row: int | None = ..., column: int | str | None = ...) -> Any: + ''' + Return the DataFrame as a scalar, or return the element at the given row/column. + + Parameters + ---------- + row + Optional row index. + column + Optional column index or name. + + See Also + -------- + row: Get the values of a single row, either by index or by predicate. + + Notes + ----- + If row/col not provided, this is equivalent to `df[0,0]`, with a check that + the shape is (1,1). With row/col, this is equivalent to `df[row,col]`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> df.select((pl.col("a") * pl.col("b")).sum()).item() + 32 + >>> df.item(1, 1) + 5 + >>> df.item(2, "b") + 6 + ''' + def to_arrow(self) -> pa.Table: + ''' + Collect the underlying arrow arrays in an Arrow Table. + + This operation is mostly zero copy. + + Data types that do copy: + - CategoricalType + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]} + ... ) + >>> df.to_arrow() + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3,4,5,6]] + bar: [["a","b","c","d","e","f"]] + ''' + def to_dict(self) -> dict[str, Series] | dict[str, list[Any]]: + ''' + Convert DataFrame to a dictionary mapping column name to values. + + Parameters + ---------- + as_series + True -> Values are Series + False -> Values are List[Any] + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... "fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... "optional": [28, 300, None, 2, -30], + ... } + ... 
) + >>> df + shape: (5, 5) + ┌─────┬────────┬─────┬────────┬──────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ optional │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╪════════╪══════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │ + └─────┴────────┴─────┴────────┴──────────┘ + >>> df.to_dict(as_series=False) + {\'A\': [1, 2, 3, 4, 5], + \'fruits\': [\'banana\', \'banana\', \'apple\', \'apple\', \'banana\'], + \'B\': [5, 4, 3, 2, 1], + \'cars\': [\'beetle\', \'audi\', \'beetle\', \'beetle\', \'beetle\'], + \'optional\': [28, 300, None, 2, -30]} + >>> df.to_dict(as_series=True) + {\'A\': shape: (5,) + Series: \'A\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ], \'fruits\': shape: (5,) + Series: \'fruits\' [str] + [ + "banana" + "banana" + "apple" + "apple" + "banana" + ], \'B\': shape: (5,) + Series: \'B\' [i64] + [ + 5 + 4 + 3 + 2 + 1 + ], \'cars\': shape: (5,) + Series: \'cars\' [str] + [ + "beetle" + "audi" + "beetle" + "beetle" + "beetle" + ], \'optional\': shape: (5,) + Series: \'optional\' [i64] + [ + 28 + 300 + null + 2 + -30 + ]} + ''' + def to_dicts(self) -> list[dict[str, Any]]: + ''' + Convert every row to a dictionary of Python-native values. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.to_dicts() + [{\'foo\': 1, \'bar\': 4}, {\'foo\': 2, \'bar\': 5}, {\'foo\': 3, \'bar\': 6}] + ''' + def to_numpy(self) -> np.ndarray[Any, Any]: + ''' + Convert DataFrame to a 2D NumPy array. + + This operation clones data. + + Parameters + ---------- + structured + Optionally return a structured array, with field names and + dtypes that correspond to the DataFrame schema. + order + The index order of the returned NumPy array, either C-like or + Fortran-like. In general, using the Fortran-like index order is faster. + However, the C-like order might be more appropriate to use for downstream + applications to prevent cloning data, e.g. when reshaping into a + one-dimensional array. Note that this option only takes effect if + `structured` is set to `False` and the DataFrame dtypes allow for a + global dtype for all columns. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + function for the conversion to numpy if necessary. + + Notes + ----- + If you\'re attempting to convert String or Decimal to an array, you\'ll need to + install `pyarrow`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.5, 7.0, 8.5], + ... "ham": ["a", "b", "c"], + ... }, + ... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32}, + ... ) + + Export to a standard 2D numpy array. + + >>> df.to_numpy() + array([[1, 6.5, \'a\'], + [2, 7.0, \'b\'], + [3, 8.5, \'c\']], dtype=object) + + Export to a structured array, which can better-preserve individual + column data, such as name and dtype... + + >>> df.to_numpy(structured=True) + array([(1, 6.5, \'a\'), (2, 7. , \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \'>> import numpy as np + >>> df.to_numpy(structured=True).view(np.recarray) + rec.array([(1, 6.5, \'a\'), (2, 7. 
, \'b\'), (3, 8.5, \'c\')], + dtype=[(\'foo\', \'u1\'), (\'bar\', \' pd.DataFrame: + ''' + Convert this DataFrame to a pandas DataFrame. + + This operation copies data if `use_pyarrow_extension_array` is not enabled. + + Parameters + ---------- + use_pyarrow_extension_array + Use PyArrow-backed extension arrays instead of NumPy arrays for the columns + of the pandas DataFrame. This allows zero copy operations and preservation + of null values. Subsequent operations on the resulting pandas DataFrame may + trigger conversion to NumPy if those operations are not supported by PyArrow + compute functions. + **kwargs + Additional keyword arguments to be passed to + :meth:`pyarrow.Table.to_pandas`. + + Returns + ------- + :class:`pandas.DataFrame` + + Notes + ----- + This operation requires that both :mod:`pandas` and :mod:`pyarrow` are + installed. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_pandas() + foo bar ham + 0 1 6.0 a + 1 2 7.0 b + 2 3 8.0 c + + Null values in numeric columns are converted to `NaN`. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, None], + ... "bar": [6.0, None, 8.0], + ... "ham": [None, "b", "c"], + ... } + ... ) + >>> df.to_pandas() + foo bar ham + 0 1.0 6.0 None + 1 2.0 NaN b + 2 NaN 8.0 c + + Pass `use_pyarrow_extension_array=True` to get a pandas DataFrame with columns + backed by PyArrow extension arrays. This will preserve null values. + + >>> df.to_pandas(use_pyarrow_extension_array=True) + foo bar ham + 0 1 6.0 + 1 2 b + 2 8.0 c + >>> _.dtypes + foo int64[pyarrow] + bar double[pyarrow] + ham large_string[pyarrow] + dtype: object + ''' + def _to_pandas_with_object_columns(self, **kwargs: Any) -> pd.DataFrame: ... + def _to_pandas_without_object_columns(self, df: DataFrame, **kwargs: Any) -> pd.DataFrame: ... + def to_series(self, index: int = ...) -> Series: + ''' + Select column as Series at index location. + + Parameters + ---------- + index + Location of selection. + + See Also + -------- + get_column + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.to_series(1) + shape: (3,) + Series: \'bar\' [i64] + [ + 6 + 7 + 8 + ] + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert DataFrame to instantiatable string representation. + + Parameters + ---------- + n + Only use first n rows. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> df = pl.DataFrame( + ... [ + ... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + ... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + ... pl.Series("ham", ["a", "b", "c"], dtype=pl.String), + ... ] + ... ) + >>> print(df.to_init_repr()) + pl.DataFrame( + [ + pl.Series("foo", [1, 2, 3], dtype=pl.UInt8), + pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32), + pl.Series("ham", [\'a\', \'b\', \'c\'], dtype=pl.String), + ] + ) + + >>> df_from_str_repr = eval(df.to_init_repr()) + >>> df_from_str_repr + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ f32 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + ''' + def write_json(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. 
+ If set to `None` (default), the output is returned as a string instead. + pretty + Pretty serialize json. + row_oriented + Write to row oriented json. This is slower, but more common. + + See Also + -------- + DataFrame.write_ndjson + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_json() + \'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}\' + >>> df.write_json(row_oriented=True) + \'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]\' + ''' + def write_ndjson(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize to newline delimited JSON representation. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... } + ... ) + >>> df.write_ndjson() + \'{"foo":1,"bar":6}\\n{"foo":2,"bar":7}\\n{"foo":3,"bar":8}\\n\' + ''' + def write_csv(self, file: BytesIO | TextIOWrapper | str | Path | None = ...) -> str | None: + ''' + Write to comma-separated values (CSV) file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. + If set to `None` (default), the output is returned as a string instead. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + separator or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... 
) + >>> path: pathlib.Path = dirpath / "new_file.csv" + >>> df.write_csv(path, separator=",") + ''' + def write_avro(self, file: BinaryIO | BytesIO | str | Path, compression: AvroCompression = ..., name: str = ...) -> None: + ''' + Write to Apache Avro file. + + Parameters + ---------- + file + File path or writeable file-like object to which the data will be written. + compression : {\'uncompressed\', \'snappy\', \'deflate\'} + Compression method. Defaults to "uncompressed". + name + Schema name. Defaults to empty string. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.avro" + >>> df.write_avro(path) + ''' + def write_excel(self, workbook: Workbook | BytesIO | Path | str | None = ..., worksheet: str | None = ...) -> Workbook: + ''' + Write frame data to a table in an Excel workbook/worksheet. + + Parameters + ---------- + workbook : Workbook + String name or path of the workbook to create, BytesIO object to write + into, or an open `xlsxwriter.Workbook` object that has not been closed. + If None, writes to a `dataframe.xlsx` workbook in the working directory. + worksheet : str + Name of target worksheet; if None, writes to "Sheet1" when creating a new + workbook (note that writing to an existing workbook requires a valid + existing -or new- worksheet name). + position : {str, tuple} + Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple. + table_style : {str, dict} + A named Excel table style, such as "Table Style Medium 4", or a dictionary + of `{"key":value,}` options containing one or more of the following keys: + "style", "first_column", "last_column", "banded_columns, "banded_rows". + table_name : str + Name of the output table object in the worksheet; can then be referred to + in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations. + column_formats : dict + A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an + Excel format string to the given columns. Formats defined here (such as + "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`. + dtype_formats : dict + A `{dtype:str,}` dictionary that sets the default Excel format for the + given dtype. (This can be overridden on a per-column basis by the + `column_formats` param). It is also valid to use dtype groups such as + `pl.FLOAT_DTYPES` as the dtype/format key, to simplify setting uniform + integer and float formats. + conditional_formats : dict + A dictionary of colname (or selector) keys to a format str, dict, or list + that defines conditional formatting options for the specified columns. + + * If supplying a string typename, should be one of the valid `xlsxwriter` + types such as "3_color_scale", "data_bar", etc. + * If supplying a dictionary you can make use of any/all `xlsxwriter` + supported options, including icon sets, formulae, etc. + * Supplying multiple columns as a tuple/key will apply a single format + across all columns - this is effective in creating a heatmap, as the + min/max values will be determined across the entire range, not per-column. + * Finally, you can also supply a list made up from the above options + in order to apply *more* than one conditional format to the same range. 
+ header_format : dict + A `{key:value,}` dictionary of `xlsxwriter` format options to apply + to the table header row, such as `{"bold":True, "font_color":"#702963"}`. + column_totals : {bool, list, dict} + Add a column-total row to the exported table. + + * If True, all numeric columns will have an associated total using "sum". + * If passing a string, it must be one of the valid total function names + and all numeric columns will have an associated total using that function. + * If passing a list of colnames, only those given will have a total. + * For more control, pass a `{colname:funcname,}` dict. + + Valid total function names are "average", "count_nums", "count", "max", + "min", "std_dev", "sum", and "var". + column_widths : {dict, int} + A `{colname:int,}` or `{selector:int,}` dict or a single integer that + sets (or overrides if autofitting) table column widths, in integer pixel + units. If given as an integer the same value is used for all table columns. + row_totals : {dict, bool} + Add a row-total column to the right-hand side of the exported table. + + * If True, a column called "total" will be added at the end of the table + that applies a "sum" function row-wise across all numeric columns. + * If passing a list/sequence of column names, only the matching columns + will participate in the sum. + * Can also pass a `{colname:columns,}` dictionary to create one or + more total columns with distinct names, referencing different columns. + row_heights : {dict, int} + An int or `{row_index:int,}` dictionary that sets the height of the given + rows (if providing a dictionary) or all rows (if providing an integer) that + intersect with the table body (including any header and total row) in + integer pixel units. Note that `row_index` starts at zero and will be + the header row (unless `include_header` is False). + sparklines : dict + A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more + sparklines to be written into a new column in the table. + + * If passing a list of colnames (used as the source of the sparkline data) + the default sparkline settings are used (eg: line chart with no markers). + * For more control an `xlsxwriter`-compliant options dict can be supplied, + in which case three additional polars-specific keys are available: + "columns", "insert_before", and "insert_after". These allow you to define + the source columns and position the sparkline(s) with respect to other + table columns. If no position directive is given, sparklines are added to + the end of the table (eg: to the far right) in the order they are given. + formulas : dict + A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or + more formulas to be written into a new column in the table. Note that you + are strongly advised to use structured references in your formulae wherever + possible to make it simple to reference columns by name. + + * If providing a string formula (such as "=[@colx]*[@coly]") the column will + be added to the end of the table (eg: to the far right), after any default + sparklines and before any row_totals. + * For the most control supply an options dictionary with the following keys: + "formula" (mandatory), one of "insert_before" or "insert_after", and + optionally "return_dtype". The latter is used to appropriately format the + output of the formula and allow it to participate in row/column totals. 
+ float_precision : int + Default number of decimals displayed for floating point columns (note that + this is purely a formatting directive; the actual values are not rounded). + include_header : bool + Indicate if the table should be created with a header row. + autofilter : bool + If the table has headers, provide autofilter capability. + autofit : bool + Calculate individual column widths from the data. + hidden_columns : list + A list or selector representing table columns to hide in the worksheet. + hide_gridlines : bool + Do not display any gridlines on the output worksheet. + sheet_zoom : int + Set the default zoom level of the output worksheet. + freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int) + Freeze workbook panes. + + * If (row, col) is supplied, panes are split at the top-left corner of the + specified cell, which are 0-indexed. Thus, to freeze only the top row, + supply (1, 0). + * Alternatively, cell notation can be used to supply the cell. For example, + "A2" indicates the split occurs at the top-left of cell A2, which is the + equivalent of (1, 0). + * If (row, col, top_row, top_col) are supplied, the panes are split based on + the `row` and `col`, and the scrolling region is inititalized to begin at + the `top_row` and `top_col`. Thus, to freeze only the top row and have the + scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4). + Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent. + + Notes + ----- + * A list of compatible `xlsxwriter` format property names can be found here: + https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties + + * Conditional formatting dictionaries should provide xlsxwriter-compatible + definitions; polars will take care of how they are applied on the worksheet + with respect to the relative sheet/column position. For supported options, + see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html + + * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible + key/values, as well as a mandatory polars "columns" key that defines the + sparkline source data; these source columns should all be adjacent. Two other + polars-specific keys are available to help define where the sparkline appears + in the table: "insert_after", and "insert_before". The value associated with + these keys should be the name of a column in the exported table. + https://xlsxwriter.readthedocs.io/working_with_sparklines.html + + * Formula dictionaries *must* contain a key called "formula", and then optional + "insert_after", "insert_before", and/or "return_dtype" keys. These additional + keys allow the column to be injected into the table at a specific location, + and/or to define the return type of the formula (eg: "Int64", "Float64", etc). + Formulas that refer to table columns should use Excel\'s structured references + syntax to ensure the formula is applied correctly and is table-relative. + https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e + + Examples + -------- + Instantiate a basic DataFrame: + + >>> from random import uniform + >>> from datetime import date + >>> + >>> df = pl.DataFrame( + ... { + ... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)], + ... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)], + ... "val": [10_000, 20_000, 30_000], + ... } + ... 
) + + Export to "dataframe.xlsx" (the default workbook name, if not specified) in the + working directory, add column totals ("sum" by default) on all numeric columns, + then autofit: + + >>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP + + Write frame to a specific location on the sheet, set a named table style, + apply US-style date formatting, increase default float precision, apply a + non-default total function to a single column, autofit: + + >>> df.write_excel( # doctest: +SKIP + ... position="B4", + ... table_style="Table Style Light 16", + ... dtype_formats={pl.Date: "mm/dd/yyyy"}, + ... column_totals={"num": "average"}, + ... float_precision=6, + ... autofit=True, + ... ) + + Write the same frame to a named worksheet twice, applying different styles + and conditional formatting to each table, adding table titles using explicit + xlsxwriter integration: + + >>> from xlsxwriter import Workbook + >>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP + ... # basic/default conditional formatting + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(3, 1), # specify position as (row,col) coordinates + ... conditional_formats={"num": "3_color_scale", "val": "data_bar"}, + ... table_style="Table Style Medium 4", + ... ) + ... + ... # advanced conditional formatting, custom styles + ... df.write_excel( + ... workbook=wb, + ... worksheet="data", + ... position=(len(df) + 7, 1), + ... table_style={ + ... "style": "Table Style Light 4", + ... "first_column": True, + ... }, + ... conditional_formats={ + ... "num": { + ... "type": "3_color_scale", + ... "min_color": "#76933c", + ... "mid_color": "#c4d79b", + ... "max_color": "#ebf1de", + ... }, + ... "val": { + ... "type": "data_bar", + ... "data_bar_2010": True, + ... "bar_color": "#9bbb59", + ... "bar_negative_color_same": True, + ... "bar_negative_border_color_same": True, + ... }, + ... }, + ... column_formats={"num": "#,##0.000;[White]-#,##0.000"}, + ... column_widths={"val": 125}, + ... autofit=True, + ... ) + ... + ... # add some table titles (with a custom format) + ... ws = wb.get_worksheet_by_name("data") + ... fmt_title = wb.add_format( + ... { + ... "font_color": "#4f6228", + ... "font_size": 12, + ... "italic": True, + ... "bold": True, + ... } + ... ) + ... ws.write(2, 1, "Basic/default conditional formatting", fmt_title) + ... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title) + + Export a table containing two different types of sparklines. Use default + options for the "trend" sparkline and customised options (and positioning) + for the "+/-" win_loss sparkline, with non-default integer dtype formatting, + column totals, a subtle two-tone heatmap and hidden worksheet gridlines: + + >>> df = pl.DataFrame( + ... { + ... "id": ["aaa", "bbb", "ccc", "ddd", "eee"], + ... "q1": [100, 55, -20, 0, 35], + ... "q2": [30, -10, 15, 60, 20], + ... "q3": [-50, 0, 40, 80, 80], + ... "q4": [75, 55, 25, -10, -55], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style="Table Style Light 2", + ... # apply accounting format to all flavours of integer + ... dtype_formats={pl.INTEGER_DTYPES: "#,##0_);(#,##0)"}, + ... sparklines={ + ... # default options; just provide source cols + ... "trend": ["q1", "q2", "q3", "q4"], + ... # customised sparkline type, with positioning directive + ... "+/-": { + ... "columns": ["q1", "q2", "q3", "q4"], + ... "insert_after": "id", + ... "type": "win_loss", + ... }, + ... }, + ... conditional_formats={ + ... 
# create a unified multi-column heatmap + ... ("q1", "q2", "q3", "q4"): { + ... "type": "2_color_scale", + ... "min_color": "#95b3d7", + ... "max_color": "#ffffff", + ... }, + ... }, + ... column_totals=["q1", "q2", "q3", "q4"], + ... row_totals=True, + ... hide_gridlines=True, + ... ) + + Export a table containing an Excel formula-based column that calculates a + standardised Z-score, showing use of structured references in conjunction + with positioning directives, column totals, and custom formatting. + + >>> df = pl.DataFrame( + ... { + ... "id": ["a123", "b345", "c567", "d789", "e101"], + ... "points": [99, 45, 50, 85, 35], + ... } + ... ) + >>> df.write_excel( # doctest: +SKIP + ... table_style={ + ... "style": "Table Style Medium 15", + ... "first_column": True, + ... }, + ... column_formats={ + ... "id": {"font": "Consolas"}, + ... "points": {"align": "center"}, + ... "z-score": {"align": "center"}, + ... }, + ... column_totals="average", + ... formulas={ + ... "z-score": { + ... # use structured references to refer to the table columns and \'totals\' row + ... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))", + ... "insert_after": "points", + ... "return_dtype": pl.Float64, + ... } + ... }, + ... hide_gridlines=True, + ... sheet_zoom=125, + ... ) + ''' + def write_ipc(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC binary stream or Feather file. + + See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC data will be + written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + future + Setting this to `True` will write Polars\' internal data structures that + might not be available by other Arrow implementations. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc(path) + ''' + def write_ipc_stream(self, file: BinaryIO | BytesIO | str | Path | None, compression: IpcCompression = ...) -> BytesIO | None: + ''' + Write to Arrow IPC record batch stream. + + See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html. + + Parameters + ---------- + file + Path or writeable file-like object to which the IPC record batch data will + be written. If set to `None`, the output is returned as a BytesIO object. + compression : {\'uncompressed\', \'lz4\', \'zstd\'} + Compression method. Defaults to "uncompressed". + + Examples + -------- + >>> import pathlib + >>> + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> path: pathlib.Path = dirpath / "new_file.arrow" + >>> df.write_ipc_stream(path) + ''' + def write_parquet(self, file: str | Path | BytesIO) -> None: + ''' + Write to Apache Parquet file. + + Parameters + ---------- + file + File path or writeable file-like object to which the result will be written. 
+ compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'}
+ Choose "zstd" for good compression performance.
+ Choose "lz4" for fast compression/decompression.
+ Choose "snappy" for more backwards compatibility guarantees
+ when you deal with older parquet readers.
+ compression_level
+ The level of compression to use. Higher compression means smaller files on
+ disk.
+ 
+ - "gzip" : min-level: 0, max-level: 10.
+ - "brotli" : min-level: 0, max-level: 11.
+ - "zstd" : min-level: 1, max-level: 22.
+ 
+ statistics
+ Write statistics to the parquet headers. This requires extra compute.
+ row_group_size
+ Size of the row groups in number of rows. Defaults to 512^2 rows.
+ data_page_size
+ Size of the data page in bytes. Defaults to 1024^2 bytes.
+ use_pyarrow
+ Use C++ parquet implementation vs Rust parquet implementation.
+ At the moment C++ supports more features.
+ pyarrow_options
+ Arguments passed to `pyarrow.parquet.write_table`.
+ 
+ If you pass `partition_cols` here, the dataset will be written
+ using `pyarrow.parquet.write_to_dataset`.
+ The `partition_cols` parameter causes the dataset to be written to a
+ directory, similar to Spark\'s partitioned datasets.
+ 
+ Examples
+ --------
+ >>> import pathlib
+ >>>
+ >>> df = pl.DataFrame(
+ ... {
+ ... "foo": [1, 2, 3, 4, 5],
+ ... "bar": [6, 7, 8, 9, 10],
+ ... "ham": ["a", "b", "c", "d", "e"],
+ ... }
+ ... )
+ >>> path: pathlib.Path = dirpath / "new_file.parquet"
+ >>> df.write_parquet(path)
+ 
+ We can use pyarrow with `use_pyarrow=True` and `pyarrow_options`
+ to write partitioned datasets. The following example will
+ write the first row to ../watermark=1/*.parquet and the
+ other rows to ../watermark=2/*.parquet.
+ 
+ >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]})
+ >>> path: pathlib.Path = dirpath / "partitioned_object"
+ >>> df.write_parquet(
+ ... path,
+ ... use_pyarrow=True,
+ ... pyarrow_options={"partition_cols": ["watermark"]},
+ ... )
+ '''
+ def write_database(self, table_name: str, connection: str) -> int:
+ '''
+ Write a polars frame to a database.
+ 
+ Parameters
+ ----------
+ table_name
+ Schema-qualified name of the table to create or append to in the target
+ SQL database. If your table name contains special characters, it should
+ be quoted.
+ connection
+ Connection URI string, for example:
+ 
+ * "postgresql://user:pass@server:port/database"
+ * "sqlite:////path/to/database.db"
+ if_table_exists : {\'append\', \'replace\', \'fail\'}
+ The insert mode:
+ 
+ * \'replace\' will create a new database table, overwriting an existing one.
+ * \'append\' will append to an existing table.
+ * \'fail\' will fail if table already exists.
+ engine : {\'sqlalchemy\', \'adbc\'}
+ Select the engine to use for writing frame data.
+ 
+ Returns
+ -------
+ int
+ The number of rows affected, if the driver provides this information.
+ Otherwise, returns -1.
+ '''
+ def write_delta(self, target: str | Path | deltalake.DeltaTable) -> deltalake.table.TableMerger | None:
+ '''
+ Write DataFrame as delta table.
+ 
+ Parameters
+ ----------
+ target
+ URI of a table or a DeltaTable object.
+ mode : {\'error\', \'append\', \'overwrite\', \'ignore\', \'merge\'}
+ How to handle existing data.
+ 
+ - If \'error\', throw an error if the table already exists (default).
+ - If \'append\', will add new data.
+ - If \'overwrite\', will replace table with new data.
+ - If \'ignore\', will not write anything if table already exists.
+ - If \'merge\', return a `TableMerger` object to merge data from the DataFrame
+ with the existing data.
+ overwrite_schema
+ If True, allows updating the schema of the table.
+ storage_options
+ Extra options for the storage backends supported by `deltalake`.
+ For cloud storages, this may include configurations for authentication etc.
+ 
+ - See a list of supported storage options for S3 `here `__.
+ - See a list of supported storage options for GCS `here `__.
+ - See a list of supported storage options for Azure `here `__.
+ delta_write_options
+ Additional keyword arguments while writing a Delta lake Table.
+ See a list of supported write options `here `__.
+ delta_merge_options
+ Keyword arguments which are required to `MERGE` a Delta lake Table.
+ See a list of supported merge options `here `__.
+ 
+ Raises
+ ------
+ TypeError
+ If the DataFrame contains unsupported data types.
+ ArrowInvalidError
+ If the DataFrame contains data types that could not be cast to their
+ primitive type.
+ TableNotFoundError
+ If the delta table doesn\'t exist and the MERGE action is triggered.
+ 
+ Notes
+ -----
+ The Polars data types :class:`Null`, :class:`Categorical` and :class:`Time`
+ are not supported by the delta protocol specification and will raise a
+ TypeError.
+ 
+ Polars columns are always nullable. To write data to a delta table with
+ non-nullable columns, a custom pyarrow schema has to be passed to the
+ `delta_write_options`. See the last example below.
+ 
+ Examples
+ --------
+ Write a dataframe to the local filesystem as a Delta Lake table.
+ 
+ >>> df = pl.DataFrame(
+ ... {
+ ... "foo": [1, 2, 3, 4, 5],
+ ... "bar": [6, 7, 8, 9, 10],
+ ... "ham": ["a", "b", "c", "d", "e"],
+ ... }
+ ... )
+ >>> table_path = "/path/to/delta-table/"
+ >>> df.write_delta(table_path) # doctest: +SKIP
+ 
+ Append data to an existing Delta Lake table on the local filesystem.
+ Note that this will fail if the schema of the new data does not match the
+ schema of the existing table.
+ 
+ >>> df.write_delta(table_path, mode="append") # doctest: +SKIP
+ 
+ Overwrite a Delta Lake table as a new version.
+ If the schemas of the new and old data are the same, setting
+ `overwrite_schema` is not required.
+ 
+ >>> existing_table_path = "/path/to/delta-table/"
+ >>> df.write_delta(
+ ... existing_table_path, mode="overwrite", overwrite_schema=True
+ ... ) # doctest: +SKIP
+ 
+ Write a DataFrame as a Delta Lake table to a cloud object store like S3.
+ 
+ >>> table_path = "s3://bucket/prefix/to/delta-table/"
+ >>> df.write_delta(
+ ... table_path,
+ ... storage_options={
+ ... "AWS_REGION": "THE_AWS_REGION",
+ ... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID",
+ ... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY",
+ ... },
+ ... ) # doctest: +SKIP
+ 
+ Write DataFrame as a Delta Lake table with non-nullable columns.
+ 
+ >>> import pyarrow as pa
+ >>> existing_table_path = "/path/to/delta-table/"
+ >>> df.write_delta(
+ ... existing_table_path,
+ ... delta_write_options={
+ ... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)])
+ ... },
+ ... ) # doctest: +SKIP
+ 
+ Merge the DataFrame with an existing Delta Lake table.
+ For all `TableMerger` methods, check the deltalake docs
+ `here `__.
+ 
+ Schema evolution is not yet supported by the `deltalake` package, therefore
+ `overwrite_schema` will not have any effect on a merge operation.
+ 
+ >>> df = pl.DataFrame(
+ ... {
+ ... "foo": [1, 2, 3, 4, 5],
+ ... "bar": [6, 7, 8, 9, 10],
+ ... "ham": ["a", "b", "c", "d", "e"],
+ ... }
+ ... )
+ >>> table_path = "/path/to/delta-table/"
+ >>> (
+ ... df.write_delta(
+ ... table_path,
+ ... mode="merge",
+ ... delta_merge_options={
+ ... "predicate": "s.foo = t.foo",
+ ... "source_alias": "s",
+ ... "target_alias": "t",
+ ... },
+ ... )
+ ... .when_matched_update_all()
+ ... .when_not_matched_insert_all()
+ ... .execute()
+ ... ) # doctest: +SKIP
+ '''
+ def estimated_size(self, unit: SizeUnit = ...) -> int | float:
+ '''
+ Return an estimation of the total (heap) allocated size of the `DataFrame`.
+ 
+ Estimated size is given in the specified unit (bytes by default).
+ 
+ This estimation is the sum of the size of its buffers, validity, including
+ nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
+ size of 2 arrays is not the sum of the sizes computed from this function. In
+ particular, [`StructArray`]\'s size is an upper bound.
+ 
+ When an array is sliced, its allocated size remains constant because the buffer
+ is unchanged. However, this function will yield a smaller number. This is because
+ this function returns the visible size of the buffer, not its total capacity.
+ 
+ FFI buffers are included in this estimation.
+ 
+ Parameters
+ ----------
+ unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'}
+ Scale the returned size to the given unit.
+ 
+ Examples
+ --------
+ >>> df = pl.DataFrame(
+ ... {
+ ... "x": list(reversed(range(1_000_000))),
+ ... "y": [v / 1000 for v in range(1_000_000)],
+ ... "z": [str(v) for v in range(1_000_000)],
+ ... },
+ ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.String)],
+ ... )
+ >>> df.estimated_size()
+ 28000000
+ >>> df.estimated_size("mb")
+ 26.702880859375
+ '''
+ def transpose(self) -> Self:
+ '''
+ Transpose a DataFrame over the diagonal.
+ 
+ Parameters
+ ----------
+ include_header
+ If set, the column names will be added as the first column.
+ header_name
+ If `include_header` is set, this determines the name of the column that will
+ be inserted.
+ column_names
+ Optional iterable yielding strings or a string naming an existing column.
+ These will name the value (non-header) columns in the transposed data.
+ 
+ Notes
+ -----
+ This is a very expensive operation. Perhaps you can do it differently.
+ 
+ Returns
+ -------
+ DataFrame
+ 
+ Examples
+ --------
+ >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+ >>> df.transpose(include_header=True)
+ shape: (2, 4)
+ ┌────────┬──────────┬──────────┬──────────┐
+ │ column ┆ column_0 ┆ column_1 ┆ column_2 │
+ │ --- ┆ --- ┆ --- ┆ --- │
+ │ str ┆ i64 ┆ i64 ┆ i64 │
+ ╞════════╪══════════╪══════════╪══════════╡
+ │ a ┆ 1 ┆ 2 ┆ 3 │
+ │ b ┆ 4 ┆ 5 ┆ 6 │
+ └────────┴──────────┴──────────┴──────────┘
+ 
+ Replace the auto-generated column names with a list
+ 
+ >>> df.transpose(include_header=False, column_names=["x", "y", "z"])
+ shape: (2, 3)
+ ┌─────┬─────┬─────┐
+ │ x ┆ y ┆ z │
+ │ --- ┆ --- ┆ --- │
+ │ i64 ┆ i64 ┆ i64 │
+ ╞═════╪═════╪═════╡
+ │ 1 ┆ 2 ┆ 3 │
+ │ 4 ┆ 5 ┆ 6 │
+ └─────┴─────┴─────┘
+ 
+ Include the header as a separate column
+ 
+ >>> df.transpose(
+ ... include_header=True, header_name="foo", column_names=["x", "y", "z"]
+ ... )
+ shape: (2, 4)
+ ┌─────┬─────┬─────┬─────┐
+ │ foo ┆ x ┆ y ┆ z │
+ │ --- ┆ --- ┆ --- ┆ --- │
+ │ str ┆ i64 ┆ i64 ┆ i64 │
+ ╞═════╪═════╪═════╪═════╡
+ │ a ┆ 1 ┆ 2 ┆ 3 │
+ │ b ┆ 4 ┆ 5 ┆ 6 │
+ └─────┴─────┴─────┴─────┘
+ 
+ Replace the auto-generated column names with names from a generator function
+ 
+ >>> def name_generator():
+ ... base_name = "my_column_"
+ ... count = 0
+ ... while True:
+ ... 
yield f"{base_name}{count}" + ... count += 1 + >>> df.transpose(include_header=False, column_names=name_generator()) + shape: (2, 3) + ┌─────────────┬─────────────┬─────────────┐ + │ my_column_0 ┆ my_column_1 ┆ my_column_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════════════╪═════════════╪═════════════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 4 ┆ 5 ┆ 6 │ + └─────────────┴─────────────┴─────────────┘ + + Use an existing column as the new column names + + >>> df = pl.DataFrame(dict(id=["i", "j", "k"], a=[1, 2, 3], b=[4, 5, 6])) + >>> df.transpose(column_names="id") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ i ┆ j ┆ k │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 3 │ + │ 4 ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + >>> df.transpose(include_header=True, header_name="new_id", column_names="id") + shape: (2, 4) + ┌────────┬─────┬─────┬─────┐ + │ new_id ┆ i ┆ j ┆ k │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 2 ┆ 3 │ + │ b ┆ 4 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┴─────┘ + ''' + def reverse(self) -> DataFrame: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> df.reverse() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + ''' + def rename(self, mapping: dict[str, str] | Callable[[str], str]) -> DataFrame: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name, or a function + that takes the old name as input and returns the new name. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.rename({"foo": "apple"}) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + >>> df.rename(lambda column_name: "c" + column_name[1:]) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ coo ┆ car ┆ cam │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + ''' + def insert_column(self, index: int, column: Series) -> Self: + ''' + Insert a Series at a certain column index. + + This operation is in place. + + Parameters + ---------- + index + Index at which to insert the new `Series` column. + column + `Series` to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series("baz", [97, 98, 99]) + >>> df.insert_column(1, s) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ baz ┆ bar │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 97 ┆ 4 │ + │ 2 ┆ 98 ┆ 5 │ + │ 3 ┆ 99 ┆ 6 │ + └─────┴─────┴─────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> s = pl.Series("d", [-2.5, 15, 20.5, 0]) + >>> df.insert_column(3, s) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ -2.5 │ + │ 2 ┆ 4.0 ┆ true ┆ 15.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 20.5 │ + │ 4 ┆ 13.0 ┆ true ┆ 0.0 │ + └─────┴──────┴───────┴──────┘ + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> DataFrame: + ''' + Filter the rows in the DataFrame based on one or more predicate expressions. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression(s) that evaluates to a boolean Series. + constraints + Column filters; use `name = value` to filter columns by the supplied value. + Each constraint will behave the same as `pl.col(name).eq(value)`, and + will be implicitly joined with the other filter conditions using `&`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> df.filter(pl.col("foo") > 1) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions, combined with and/or operators: + + >>> df.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + >>> df.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> df.filter( + ... pl.col("foo") <= 2, + ... ~pl.col("ham").is_in(["b", "c"]), + ... ) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> df.filter(foo=2, ham="b") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + ''' + def glimpse(self) -> str | None: + ''' + Return a dense preview of the DataFrame. + + The formatting shows one line per column so that wide dataframes display + cleanly. Each line shows the column name, the data type, and the first + few values. + + Parameters + ---------- + max_items_per_column + Maximum number of items to show per column. + max_colname_length + Maximum length of the displayed column names; values that exceed this + value are truncated with a trailing ellipsis. + return_as_string + If True, return the preview as a string instead of printing to stdout. + + See Also + -------- + describe, head, tail + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, 2.8, 3.0], + ... "b": [4, 5, None], + ... "c": [True, False, True], + ... "d": [None, "b", "c"], + ... "e": ["usd", "eur", None], + ... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)], + ... } + ... 
) + >>> df.glimpse() + Rows: 3 + Columns: 6 + $ a 1.0, 2.8, 3.0 + $ b 4, 5, None + $ c True, False, True + $ d None, \'b\', \'c\' + $ e \'usd\', \'eur\', None + $ f 2020-01-01, 2021-01-02, 2022-01-01 + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Summary statistics for a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method used when calculating percentiles. + + Notes + ----- + The median is included by default as the 50% percentile. + + Warnings + -------- + We do not guarantee the output of `describe` to be stable. It will show + statistics that we deem informative, and may be updated in the future. + Using `describe` programmatically (versus interactive exploration) is + not recommended for this reason. + + See Also + -------- + glimpse + + Examples + -------- + >>> from datetime import date, time + >>> df = pl.DataFrame( + ... { + ... "float": [1.0, 2.8, 3.0], + ... "int": [40, 50, None], + ... "bool": [True, False, True], + ... "str": ["zz", "xx", "yy"], + ... "date": [date(2020, 1, 1), date(2021, 7, 5), date(2022, 12, 31)], + ... "time": [time(10, 20, 30), time(14, 45, 50), time(23, 15, 10)], + ... } + ... ) + + Show default frame statistics: + + >>> df.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐ + │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │ + │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ + │ 25% ┆ 2.8 ┆ 40.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 50% ┆ 2.8 ┆ 50.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 75% ┆ 3.0 ┆ 50.0 ┆ null ┆ null ┆ 2022-12-31 ┆ 23:15:10 │ + │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ + └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘ + + Customize which percentiles are displayed, applying linear interpolation: + + >>> df.describe( + ... percentiles=[0.1, 0.3, 0.5, 0.7, 0.9], + ... interpolation="linear", + ... 
) + shape: (11, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐ + │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │ + │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ + │ 10% ┆ 1.36 ┆ 41.0 ┆ null ┆ null ┆ 2020-04-20 ┆ 11:13:34 │ + │ 30% ┆ 2.08 ┆ 43.0 ┆ null ┆ null ┆ 2020-11-26 ┆ 12:59:42 │ + │ 50% ┆ 2.8 ┆ 45.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 70% ┆ 2.88 ┆ 47.0 ┆ null ┆ null ┆ 2022-02-07 ┆ 18:09:34 │ + │ 90% ┆ 2.96 ┆ 49.0 ┆ null ┆ null ┆ 2022-09-13 ┆ 21:33:18 │ + │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ + └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘ + ''' + def get_column_index(self, name: str) -> int: + ''' + Find the index of a column by name. + + Parameters + ---------- + name + Name of the column to find. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + ... ) + >>> df.get_column_index("ham") + 2 + ''' + def replace_column(self, index: int, column: Series) -> Self: + ''' + Replace a column at an index location. + + This operation is in place. + + Parameters + ---------- + index + Column index. + column + Series that will replace the column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> s = pl.Series("apple", [10, 20, 30]) + >>> df.replace_column(0, s) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 10 ┆ 6 ┆ a │ + │ 20 ┆ 7 ┆ b │ + │ 30 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> DataFrame: + ''' + Sort the dataframe by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> df.sort("a") + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. + + >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. 
+ + >>> df.sort(["c", "a"], descending=True) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.sort("c", "a", descending=[False, True]) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + ''' + def top_k(self, k: int) -> DataFrame: + ''' + Return the `k` largest elements. + + If `descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the `k` smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> df.top_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> df.top_k(4, by=["b", "a"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + ''' + def bottom_k(self, k: int) -> DataFrame: + ''' + Return the `k` smallest elements. + + If `descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the `k` largest. Bottom-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> df.bottom_k(4, by="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. + + >>> df.bottom_k(4, by=["a", "b"]) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + ''' + def equals(self, other: DataFrame) -> bool: + ''' + Check whether the DataFrame is equal to another DataFrame. + + Parameters + ---------- + other + DataFrame to compare with. 
+ null_equal + Consider null values as equal. + + See Also + -------- + assert_frame_equal + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 2, 1], + ... "bar": [8.0, 7.0, 6.0], + ... "ham": ["c", "b", "a"], + ... } + ... ) + >>> df1.equals(df1) + True + >>> df1.equals(df2) + False + ''' + def replace(self, column: str, new_column: Series) -> Self: + ''' + Replace a column by a new Series. + + Parameters + ---------- + column + Column to replace. + new_column + New column to insert. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> s = pl.Series([10, 20, 30]) + >>> df.replace("foo", s) # works in-place! # doctest: +SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 4 │ + │ 20 ┆ 5 │ + │ 30 ┆ 6 │ + └─────┴─────┘ + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.slice(1, 2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + tail, glimpse, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.head(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> df.head(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 4, 5], + ... "bar": [6, 7, 8, 9, 10], + ... "ham": ["a", "b", "c", "d", "e"], + ... } + ... ) + >>> df.tail(3) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> df.tail(-3) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 9 ┆ d │ + │ 5 ┆ 10 ┆ e │ + └─────┴─────┴─────┘ + ''' + def limit(self, n: int = ...) -> Self: + """ + Get the first `n` rows. 
+ + Alias for :func:`DataFrame.head`. + + Parameters + ---------- + n + Number of rows to return. If a negative value is passed, return all rows + except the last `abs(n)`. + + See Also + -------- + head + """ + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> df.drop_nulls() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> df.drop_nulls(subset=cs.integer()) + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + Below are some additional examples that show how to drop null + values based on other conditions. + + >>> df = pl.DataFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> df + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> df.filter(~pl.all_horizontal(pl.all().is_null())) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + + Drop a column if all values are null: + + >>> df[[s.name for s in df if not (s.null_count() == df.height)]] + shape: (4, 2) + ┌──────┬──────┐ + │ b ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 1 ┆ 1 │ + │ 2 ┆ null │ + │ null ┆ null │ + │ 1 ┆ 1 │ + └──────┴──────┘ + ''' + def pipe(self, function: Callable[Concatenate[DataFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Notes + ----- + It is recommended to use LazyFrame when piping operations, in order + to fully take advantage of query optimization and parallelization. + See :meth:`df.lazy() `. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... 
return data.with_columns(pl.col(col_name).cast(pl.Int64)) + >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]}) + >>> df.pipe(cast_str_to_int, col_name="b") + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]}) + >>> df + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns))) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + ''' + def with_row_index(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a row index as the first column in the DataFrame. + + Parameters + ---------- + name + Name of the index column. + offset + Start the index at this offset. Cannot be negative. + + Notes + ----- + The resulting column does not have any special properties. It is a regular + column of type `UInt32` (or `UInt64` in `polars-u64-idx`). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_index() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + >>> df.with_row_index("id", offset=1000) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ id ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1000 ┆ 1 ┆ 2 │ + │ 1001 ┆ 3 ┆ 4 │ + │ 1002 ┆ 5 ┆ 6 │ + └──────┴─────┴─────┘ + + An index column can also be created using the expressions :func:`int_range` + and :func:`len`. + + >>> df.select( + ... pl.int_range(pl.len(), dtype=pl.UInt32).alias("index"), + ... pl.all(), + ... ) + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + .. deprecated:: + Use :meth:`with_row_index` instead. + Note that the default column name has changed from \'row_nr\' to \'index\'. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. Default = 0 + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_count() # doctest: +SKIP + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + ''' + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. 
+ + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> df.group_by("a", maintain_order=True).agg(pl.col("c")) + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + + The `GroupBy` object returned by this method is iterable, returning the name + and data of each group. + + >>> for name, data in df.group_by("a"): # doctest: +SKIP + ... print(name) + ... print(data) + a + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘ + b + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘ + c + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def rolling(self, index_column: IntoExpr) -> RollingGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + Different from a `group_by_dynamic` the windows are now determined by the + individual values and are not of constant intervals. For constant intervals use + :func:`DataFrame.group_by_dynamic`. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... 
+ * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - **"1i" # length 1** + - **"10i" # length 10** + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling operation on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + RollingGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = df.rolling(index_column="dt", period="2d").agg( + ... [ + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ] + ... 
) + >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1] + >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1] + >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1] + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. 
+ * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> df + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> df.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()) + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n")) + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n")) + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> df + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> df.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")) + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> df = pl.DataFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> ( + ... df.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")) + ... ) + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + ''' + def upsample(self, time_column: str) -> Self: + ''' + Upsample a DataFrame at a regular frequency. + + The `every` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + + - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + Parameters + ---------- + time_column + Time column will be used to determine a date_range. + Note that this column has to be sorted for the output to make sense. + every + Interval will start \'every\' duration. + offset + Change the start of the date_range by this offset. + by + First group by these columns and then upsample for every group. + maintain_order + Keep the ordering predictable. This is slower. + + Returns + ------- + DataFrame + Result will be sorted by `time_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + Examples + -------- + Upsample a DataFrame by a certain interval. + + >>> from datetime import datetime + >>> df = pl.DataFrame( + ... { + ... "time": [ + ... datetime(2021, 2, 1), + ... datetime(2021, 4, 1), + ... datetime(2021, 5, 1), + ... datetime(2021, 6, 1), + ... ], + ... "groups": ["A", "B", "A", "B"], + ... 
"values": [0, 1, 2, 3], + ... } + ... ).set_sorted("time") + >>> df.upsample( + ... time_column="time", every="1mo", by="groups", maintain_order=True + ... ).select(pl.all().forward_fill()) + shape: (7, 3) + ┌─────────────────────┬────────┬────────┐ + │ time ┆ groups ┆ values │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ i64 │ + ╞═════════════════════╪════════╪════════╡ + │ 2021-02-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-03-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-04-01 00:00:00 ┆ A ┆ 0 │ + │ 2021-05-01 00:00:00 ┆ A ┆ 2 │ + │ 2021-04-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-05-01 00:00:00 ┆ B ┆ 1 │ + │ 2021-06-01 00:00:00 ┆ B ┆ 3 │ + └─────────────────────┴────────┴────────┘ + ''' + def join_asof(self, other: DataFrame) -> DataFrame: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the asof_join key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + - A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + join on these columns before doing asof join + by_left + join on these columns before doing asof join + by_right + join on these columns before doing asof join + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.DataFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... 
datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward") + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + ''' + def join(self, other: DataFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> DataFrame: + ''' + Join in SQL-like fashion. + + Parameters + ---------- + other + DataFrame to join with. + on + Name(s) of the join columns in both DataFrames. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\', \'outer_coalesce\'} + Join strategy. + + * *inner* + Returns rows that have matching values in both tables + * *left* + Returns all rows from the left table, and the matched rows from the + right table + * *outer* + Returns all rows when there is a match in either left or right table + * *outer_coalesce* + Same as \'outer\', but coalesces the key columns + * *cross* + Returns the cartisian product of rows from both tables + * *semi* + Filter rows that have a match in the right table. + * *anti* + Filter rows that not have a match in the right table. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Name(s) of the left join column(s). + right_on + Name(s) of the right join column(s). + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type. + + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + join_nulls + Join on null values. By default null values will never produce matches. + + Returns + ------- + DataFrame + + See Also + -------- + join_asof + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_df = pl.DataFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... 
) + >>> df.join(other_df, on="ham") + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="outer") + shape: (4, 5) + ┌──────┬──────┬──────┬───────┬───────────┐ + │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞══════╪══════╪══════╪═══════╪═══════════╡ + │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │ + │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │ + │ null ┆ null ┆ null ┆ z ┆ d │ + │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │ + └──────┴──────┴──────┴───────┴───────────┘ + + >>> df.join(other_df, on="ham", how="left") + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + + >>> df.join(other_df, on="ham", how="semi") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + + >>> df.join(other_df, on="ham", how="anti") + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + Notes + ----- + For joining on columns with categorical data, see `pl.StringCache()`. + ''' + def map_rows(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + ''' + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF will receive each row as a tuple of values: `udf(row)`. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema. + + Notes + ----- + * The frame-level `apply` cannot track column names (as the UDF is a black-box + that may arbitrarily drop, rearrange, transform, or add new columns); if you + want to apply a UDF such that column names are preserved, you should use the + expression-level `apply` syntax instead. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]}) + + Return a DataFrame by mapping each row to a tuple: + + >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3)) + shape: (3, 2) + ┌──────────┬──────────┐ + │ column_0 ┆ column_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════════╪══════════╡ + │ 2 ┆ -3 │ + │ 4 ┆ 15 │ + │ 6 ┆ 24 │ + └──────────┴──────────┘ + + However, it is much better to implement this with a native expression: + + >>> df.select( + ... pl.col("foo") * 2, + ... pl.col("bar") * 3, + ... ) # doctest: +IGNORE_RESULT + + Return a DataFrame with a single column by mapping each row to a scalar: + + >>> df.map_rows(lambda t: (t[0] * 2 + t[1])) # doctest: +SKIP + shape: (3, 1) + ┌───────┐ + │ apply │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 1 │ + │ 9 │ + │ 14 │ + └───────┘ + + In this case it is better to use the following native expression: + + >>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT + ''' + def hstack(self, columns: list[Series] | DataFrame) -> Self: + ''' + Return a new DataFrame grown horizontally by stacking multiple Series to it. + + Parameters + ---------- + columns + Series to stack. + in_place + Modify in place. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> x = pl.Series("apple", [10, 20, 30]) + >>> df.hstack([x]) + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6 ┆ a ┆ 10 │ + │ 2 ┆ 7 ┆ b ┆ 20 │ + │ 3 ┆ 8 ┆ c ┆ 30 │ + └─────┴─────┴─────┴───────┘ + ''' + def vstack(self, other: DataFrame) -> Self: + ''' + Grow this DataFrame vertically by stacking a DataFrame to it. + + Parameters + ---------- + other + DataFrame to stack. + in_place + Modify in place. + + See Also + -------- + extend + + Examples + -------- + >>> df1 = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [6, 7], + ... "ham": ["a", "b"], + ... } + ... ) + >>> df2 = pl.DataFrame( + ... { + ... "foo": [3, 4], + ... "bar": [8, 9], + ... "ham": ["c", "d"], + ... } + ... ) + >>> df1.vstack(df2) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + │ 4 ┆ 9 ┆ d │ + └─────┴─────┴─────┘ + ''' + def extend(self, other: DataFrame) -> Self: + ''' + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of + this `DataFrame`, `extend` appends the data from `other` to the underlying + memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows and rerun + a query. + + Prefer `vstack` over `extend` when you want to append many times before + doing a query. For instance, when you read in multiple files and want to store + them in a single `DataFrame`. In the latter case, finish the sequence of + `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + + Warnings + -------- + This method modifies the dataframe in-place. The dataframe is returned for + convenience only. 
+ + See Also + -------- + vstack + + Examples + -------- + >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]}) + >>> df1.extend(df2) + shape: (6, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + │ 10 ┆ 40 │ + │ 20 ┆ 50 │ + │ 30 ┆ 60 │ + └─────┴─────┘ + ''' + def drop(self, *columns: ColumnNameOrSelector | Iterable[ColumnNameOrSelector]) -> DataFrame: + ''' + Remove columns from the dataframe. + + Parameters + ---------- + *columns + Names of the columns that should be removed from the dataframe. + Accepts column selector input. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop("ham") + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a list of column names. + + >>> df.drop(["bar", "ham"]) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> df.drop(cs.numeric()) + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> df.drop("foo", "ham") + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + ''' + def drop_in_place(self, name: str) -> Series: + ''' + Drop a single column in-place and return the dropped column. + + Parameters + ---------- + name + Name of the column to drop. + + Returns + ------- + Series + The dropped column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.drop_in_place("ham") + shape: (3,) + Series: \'ham\' [str] + [ + "a" + "b" + "c" + ] + ''' + def cast(self, dtypes: Mapping[ColumnNameOrSelector | PolarsDataType, PolarsDataType] | PolarsDataType) -> DataFrame: + ''' + Cast DataFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... 
) + + Cast specific frame columns to the specified dtypes: + + >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns matching one dtype (or dtype group) to another dtype: + + >>> df.cast({pl.Date: pl.Datetime}) + shape: (3, 3) + ┌─────┬─────┬─────────────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ datetime[μs] │ + ╞═════╪═════╪═════════════════════╡ + │ 1 ┆ 6.0 ┆ 2020-01-02 00:00:00 │ + │ 2 ┆ 7.0 ┆ 2021-03-04 00:00:00 │ + │ 3 ┆ 8.0 ┆ 2022-05-06 00:00:00 │ + └─────┴─────┴─────────────────────┘ + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.String}) + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.String).to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + ''' + def clear(self, n: int = ...) -> Self: + ''' + Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame. + + Returns a `n`-row null-filled DataFrame with an identical schema. + `n` can be greater than the current number of rows in the DataFrame. + + Parameters + ---------- + n + Number of (null-filled) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.clear() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> df.clear(n=2) + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + ''' + def clone(self) -> Self: + ''' + Create a copy of this DataFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current DataFrame, with identical + schema but no data. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.clone() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true │ + │ 2 ┆ 4.0 ┆ true │ + │ 3 ┆ 10.0 ┆ false │ + │ 4 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + ''' + def get_columns(self) -> list[Series]: + ''' + Get the DataFrame as a List of Series. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_columns() + [shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ], shape: (3,) + Series: \'bar\' [i64] + [ + 4 + 5 + 6 + ]] + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.get_columns() + [shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ], shape: (4,) + Series: \'b\' [f64] + [ + 0.5 + 4.0 + 10.0 + 13.0 + ], shape: (4,) + Series: \'c\' [bool] + [ + true + true + false + true + ]] + ''' + def get_column(self, name: str) -> Series: + ''' + Get a single column by name. + + Parameters + ---------- + name : str + Name of the column to retrieve. + + Returns + ------- + Series + + See Also + -------- + to_series + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.get_column("foo") + shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> DataFrame: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertype of the fill `value`. + + Returns + ------- + DataFrame + DataFrame with None values replaced by the filling strategy. + + See Also + -------- + fill_nan + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... ) + >>> df.fill_null(99) + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> df.fill_null(strategy="forward") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="max") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> df.fill_null(strategy="zero") + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + ''' + def fill_nan(self, value: Expr | int | float | None) -> DataFrame: + ''' + Fill floating point NaN values by an Expression evaluation. + + Parameters + ---------- + value + Value with which to replace NaN values. + + Returns + ------- + DataFrame + DataFrame with NaN values replaced by the given value. + + Warnings + -------- + Note that floating point NaNs (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null`. + + See Also + -------- + fill_null + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> df.fill_nan(99) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> DataFrame: + ''' + Explode the dataframe to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or String datatype. 
+ *more_columns + Additional names of columns to explode, specified as positional arguments. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────────┬───────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════════╪═══════════╡ + │ a ┆ [1] │ + │ a ┆ [2, 3] │ + │ b ┆ [4, 5] │ + │ c ┆ [6, 7, 8] │ + └─────────┴───────────┘ + >>> df.explode("numbers") + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + ''' + def pivot(self, values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None, aggregate_function: PivotAgg | Expr | None = ...) -> Self: + ''' + Create a spreadsheet-style pivot table as a DataFrame. + + Only available in eager mode. See "Examples" section below for how to do a + "lazy pivot" if you know the unique column values in advance. + + Parameters + ---------- + values + Column values to aggregate. Can be multiple columns if the *columns* + arguments contains multiple columns as well. + index + One or multiple keys to group by. + columns + Name of the column(s) whose values will be used as the header of the output + DataFrame. + aggregate_function + Choose from: + + - None: no aggregation takes place, will raise error if multiple values are in group. + - A predefined aggregate function string, one of + {\'min\', \'max\', \'first\', \'last\', \'sum\', \'mean\', \'median\', \'len\'} + - An expression to do the aggregation. + maintain_order + Sort the grouped keys so that the output order is predictable. + sort_columns + Sort the transposed columns by name. Default is by order of discovery. + separator + Used as separator/delimiter in generated column names. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": ["one", "one", "two", "two", "one", "two"], + ... "bar": ["y", "y", "y", "x", "x", "x"], + ... "baz": [1, 2, 3, 4, 5, 6], + ... } + ... ) + >>> df.pivot(values="baz", index="foo", columns="bar", aggregate_function="sum") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ y ┆ x │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ one ┆ 3 ┆ 5 │ + │ two ┆ 3 ┆ 10 │ + └─────┴─────┴─────┘ + + Pivot using selectors to determine the index/values/columns: + + >>> import polars.selectors as cs + >>> df.pivot( + ... values=cs.numeric(), + ... index=cs.string(), + ... columns=cs.string(), + ... aggregate_function="sum", + ... sort_columns=True, + ... ).sort( + ... by=cs.string(), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────┬──────┬──────┬──────┐ + │ foo ┆ bar ┆ one ┆ two ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪══════╪══════╪══════╪══════╡ + │ one ┆ x ┆ 5 ┆ null ┆ 5 ┆ null │ + │ one ┆ y ┆ 3 ┆ null ┆ null ┆ 3 │ + │ two ┆ x ┆ null ┆ 10 ┆ 10 ┆ null │ + │ two ┆ y ┆ null ┆ 3 ┆ null ┆ 3 │ + └─────┴─────┴──────┴──────┴──────┴──────┘ + + Run an expression as aggregation function + + >>> df = pl.DataFrame( + ... { + ... "col1": ["a", "a", "a", "b", "b", "b"], + ... "col2": ["x", "x", "x", "x", "y", "y"], + ... 
"col3": [6, 7, 3, 2, 5, 7], + ... } + ... ) + >>> df.pivot( + ... index="col1", + ... columns="col2", + ... values="col3", + ... aggregate_function=pl.element().tanh().mean(), + ... ) + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + + Note that `pivot` is only available in eager mode. If you know the unique + column values in advance, you can use :meth:`polars.LazyFrame.groupby` to + get the same result as above in lazy mode: + + >>> index = pl.col("col1") + >>> columns = pl.col("col2") + >>> values = pl.col("col3") + >>> unique_column_values = ["x", "y"] + >>> aggregate_function = lambda col: col.tanh().mean() + >>> ( + ... df.lazy() + ... .group_by(index) + ... .agg( + ... *[ + ... aggregate_function(values.filter(columns == value)).alias(value) + ... for value in unique_column_values + ... ] + ... ) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌──────┬──────────┬──────────┐ + │ col1 ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 │ + ╞══════╪══════════╪══════════╡ + │ a ┆ 0.998347 ┆ null │ + │ b ┆ 0.964028 ┆ 0.999954 │ + └──────┴──────────┴──────────┘ + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> df.melt(id_vars="a", value_vars=cs.numeric()) + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + ''' + def unstack(self, step: int, how: UnstackDirection = ..., columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., fill_values: list[Any] | None = ...) -> DataFrame: + ''' + Unstack a long table to a wide form without doing an aggregation. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + This can be much faster than a pivot, because it can skip the grouping phase. + + Parameters + ---------- + step + Number of rows in the unstacked frame. + how : { \'vertical\', \'horizontal\' } + Direction of the unstack. + columns + Column name(s) or selector(s) to include in the operation. 
+ If set to `None` (default), use all columns. + fill_values + Fill values that don\'t fit the new size with this value. + + Examples + -------- + >>> from string import ascii_uppercase + >>> df = pl.DataFrame( + ... { + ... "x": list(ascii_uppercase[0:8]), + ... "y": pl.int_range(1, 9, eager=True), + ... } + ... ).with_columns( + ... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8), + ... ) + >>> df + shape: (8, 3) + ┌─────┬─────┬──────────┐ + │ x ┆ y ┆ z │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ list[u8] │ + ╞═════╪═════╪══════════╡ + │ A ┆ 1 ┆ [1, 2] │ + │ B ┆ 2 ┆ [2, 3] │ + │ C ┆ 3 ┆ [3, 4] │ + │ D ┆ 4 ┆ [4, 5] │ + │ E ┆ 5 ┆ [5, 6] │ + │ F ┆ 6 ┆ [6, 7] │ + │ G ┆ 7 ┆ [7, 8] │ + │ H ┆ 8 ┆ [8, 9] │ + └─────┴─────┴──────────┘ + >>> df.unstack(step=4, how="vertical") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │ + │ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │ + │ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │ + │ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> df.unstack(step=2, how="horizontal") + shape: (4, 6) + ┌─────┬─────┬─────┬─────┬──────────┬──────────┐ + │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │ + ╞═════╪═════╪═════╪═════╪══════════╪══════════╡ + │ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │ + │ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │ + │ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │ + │ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │ + └─────┴─────┴─────┴─────┴──────────┴──────────┘ + >>> import polars.selectors as cs + >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0) + shape: (5, 2) + ┌─────┬─────┐ + │ y_0 ┆ y_1 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + │ 4 ┆ 0 │ + │ 5 ┆ 0 │ + └─────┴─────┘ + ''' + def partition_by(self, by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector], *more_by: ColumnNameOrSelector) -> list[Self] | dict[Any, Self]: + ''' + Group by the given columns and return the groups as separate dataframes. + + Parameters + ---------- + by + Column name(s) or selector(s) to group by. + *more_by + Additional names of columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default partition by operation. + include_key + Include the columns used to partition the DataFrame in the output. + as_dict + Return a dictionary instead of a list. The dictionary keys are tuples of + the distinct group values that identify each group. If a single string + was passed to `by`, the keys are a single value instead of a tuple. + + Examples + -------- + Pass a single column name to partition by that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... 
) + >>> df.partition_by("a") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Partition by multiple columns by either passing a list of column names, or by + specifying each column name as a positional argument. + + >>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT + [shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘] + + Return the partitions as a dictionary by specifying `as_dict=True`. + + >>> import polars.selectors as cs + >>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT + {(\'a\',): shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ a ┆ 1 ┆ 3 │ + └─────┴─────┴─────┘, + (\'b\',): shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + └─────┴─────┴─────┘, + (\'c\',): shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘} + ''' + def shift(self, n: int = ...) -> DataFrame: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> df.shift() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.shift(-2) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. 
+ + >>> df.shift(-2, fill_value=100) + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + ''' + def is_duplicated(self) -> Series: + ''' + Get a mask of all duplicated rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_duplicated() + shape: (4,) + Series: \'\' [bool] + [ + true + false + false + true + ] + + This mask can be used to visualize the duplicated lines like this: + + >>> df.filter(df.is_duplicated()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 1 ┆ x │ + │ 1 ┆ x │ + └─────┴─────┘ + ''' + def is_unique(self) -> Series: + ''' + Get a mask of all unique rows in this DataFrame. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + ... ) + >>> df.is_unique() + shape: (4,) + Series: \'\' [bool] + [ + false + true + true + false + ] + + This mask can be used to visualize the unique lines like this: + + >>> df.filter(df.is_unique()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 2 ┆ y │ + │ 3 ┆ z │ + └─────┴─────┘ + ''' + def lazy(self) -> LazyFrame: + ''' + Start a lazy query from this point. This returns a `LazyFrame` object. + + Operations on a `LazyFrame` are not executed until this is requested by either + calling: + + * :meth:`.fetch() ` + (run on a small number of rows) + * :meth:`.collect() ` + (run on all data) + * :meth:`.describe_plan() ` + (print unoptimized query plan) + * :meth:`.describe_optimized_plan() ` + (print optimized query plan) + * :meth:`.show_graph() ` + (show (un)optimized query plan as graphviz graph) + + Lazy operations are advised because they allow for query optimization and more + parallelization. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> df.lazy() # doctest: +ELLIPSIS + + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Select columns from this DataFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.select("foo") + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> df.select(["foo", "bar"]) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. 
+ + >>> df.select(pl.col("foo"), pl.col("bar") + 1) + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0)) + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ) + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Select columns from this DataFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + """ + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + ''' + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + Notes + ----- + Creating a new DataFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... ) + >>> df.with_columns((pl.col("a") ** 2).alias("a^2")) + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> df.with_columns(pl.col("a").cast(pl.Float64)) + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> df.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... 
) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> df.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ) + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> df.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ) + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... df.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ) + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> DataFrame: + """ + Add columns to this DataFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + DataFrame + A new DataFrame with the columns added. + + See Also + -------- + with_columns + """ + def n_chunks(self, strategy: str = ...) -> int | list[int]: + ''' + Get number of chunks used by the ChunkedArrays of this DataFrame. + + Parameters + ---------- + strategy : {\'first\', \'all\'} + Return the number of chunks of the \'first\' column, + or \'all\' columns in this DataFrame. + + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> df.n_chunks() + 1 + >>> df.n_chunks(strategy="all") + [1, 1, 1] + ''' + def max(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their maximum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`max_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.max() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + ''' + def max_horizontal(self) -> Series: + ''' + Get the maximum value horizontally across columns. + + Returns + ------- + Series + A Series named `"max"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.max_horizontal() + shape: (3,) + Series: \'max\' [f64] + [ + 4.0 + 5.0 + 6.0 + ] + ''' + def min(self, axis: int | None = ...) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their minimum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`min_horizontal`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.min() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + ''' + def min_horizontal(self) -> Series: + ''' + Get the minimum value horizontally across columns. + + Returns + ------- + Series + A Series named `"min"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.min_horizontal() + shape: (3,) + Series: \'min\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def sum(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their sum value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`sum_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sum() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 6 ┆ 21 ┆ null │ + └─────┴─────┴──────┘ + ''' + def sum_horizontal(self) -> Series: + ''' + Sum all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. 
+ + Returns + ------- + Series + A Series named `"sum"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.sum_horizontal() + shape: (3,) + Series: \'sum\' [f64] + [ + 5.0 + 7.0 + 9.0 + ] + ''' + def mean(self) -> Self | Series: + ''' + Aggregate the columns of this DataFrame to their mean value. + + Parameters + ---------- + axis + Either 0 (vertical) or 1 (horizontal). + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. This method will only + support vertical aggregation, as if `axis` were set to `0`. + To perform horizontal aggregation, use :meth:`mean_horizontal`. + null_strategy : {\'ignore\', \'propagate\'} + This argument is only used if `axis == 1`. + + .. deprecated:: 0.19.14 + This argument will be removed in a future version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... "spam": [True, False, None], + ... } + ... ) + >>> df.mean() + shape: (1, 4) + ┌─────┬─────┬──────┬──────┐ + │ foo ┆ bar ┆ ham ┆ spam │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str ┆ f64 │ + ╞═════╪═════╪══════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │ + └─────┴─────┴──────┴──────┘ + ''' + def mean_horizontal(self) -> Series: + ''' + Take the mean of all values horizontally across columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + If set to `False`, any null value in the input will lead to a null output. + + Returns + ------- + Series + A Series named `"mean"`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [4.0, 5.0, 6.0], + ... } + ... ) + >>> df.mean_horizontal() + shape: (3,) + Series: \'mean\' [f64] + [ + 2.5 + 3.5 + 4.5 + ] + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.std() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.std(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.816497 ┆ 0.816497 ┆ null │ + └──────────┴──────────┴──────┘ + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> df.var() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1.0 ┆ 1.0 ┆ null │ + └─────┴─────┴──────┘ + >>> df.var(ddof=0) + shape: (1, 3) + ┌──────────┬──────────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞══════════╪══════════╪══════╡ + │ 0.666667 ┆ 0.666667 ┆ null │ + └──────────┴──────────┴──────┘ + ''' + def median(self) -> Self: + ''' + Aggregate the columns of this DataFrame to their median value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.median() + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + ''' + def product(self) -> DataFrame: + ''' + Aggregate the columns of this DataFrame to their product values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [0.5, 4, 10], + ... "c": [True, True, False], + ... } + ... ) + + >>> df.product() + shape: (1, 3) + ┌─────┬──────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪══════╪═════╡ + │ 6 ┆ 20.0 ┆ 0 │ + └─────┴──────┴─────┘ + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns of this DataFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.quantile(0.5, "nearest") + shape: (1, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 2.0 ┆ 7.0 ┆ null │ + └─────┴─────┴──────┘ + ''' + def to_dummies(self, columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Convert categorical variables into dummy/indicator variables. + + Parameters + ---------- + columns + Column name(s) or selector(s) that should be converted to dummy + variables. If set to `None` (default), convert all columns. + separator + Separator/delimiter used when generating column names. + drop_first + Remove the first category from the variables being encoded. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2], + ... "bar": [3, 4], + ... "ham": ["a", "b"], + ... } + ... 
) + >>> df.to_dummies() + shape: (2, 6) + ┌───────┬───────┬───────┬───────┬───────┬───────┐ + │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ + └───────┴───────┴───────┴───────┴───────┴───────┘ + + >>> df.to_dummies(drop_first=True) + shape: (2, 3) + ┌───────┬───────┬───────┐ + │ foo_2 ┆ bar_4 ┆ ham_b │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═══════╪═══════╪═══════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + └───────┴───────┴───────┘ + + >>> import polars.selectors as cs + >>> df.to_dummies(cs.integer(), separator=":") + shape: (2, 5) + ┌───────┬───────┬───────┬───────┬─────┐ + │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═══════╪═══════╪═════╡ + │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │ + │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │ + └───────┴───────┴───────┴───────┴─────┘ + + >>> df.to_dummies(cs.integer(), drop_first=True, separator=":") + shape: (2, 3) + ┌───────┬───────┬─────┐ + │ foo:2 ┆ bar:4 ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str │ + ╞═══════╪═══════╪═════╡ + │ 0 ┆ 0 ┆ a │ + │ 1 ┆ 1 ┆ b │ + └───────┴───────┴─────┘ + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> DataFrame: + ''' + Drop duplicate rows from this dataframe. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + DataFrame + DataFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> df.unique(maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(subset=["bar", "ham"], maintain_order=True) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> df.unique(keep="last", maintain_order=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + ''' + def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = ...) -> int: + ''' + Return the number of unique rows, or the number of unique row-subsets. + + Parameters + ---------- + subset + One or more columns/expressions that define what to count; + omit to return the count of unique rows. 
+ + Notes + ----- + This method operates at the `DataFrame` level; to operate on subsets at the + expression level you can make use of struct-packing instead, for example: + + >>> expr_unique_subset = pl.struct(["a", "b"]).n_unique() + + If instead you want to count the number of unique values per-column, you can + also use expression-level syntax to return a new frame containing that result: + + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) + >>> df_nunique = df.select(pl.all().n_unique()) + + In aggregate context there is also an equivalent method for returning the + unique values per-group: + + >>> df_agg_nunique = df.group_by(by=["a"]).n_unique() + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3, 4, 5], + ... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0], + ... "c": [True, True, True, False, True, True], + ... } + ... ) + >>> df.n_unique() + 5 + + Simple columns subset. + + >>> df.n_unique(subset=["b", "c"]) + 4 + + Expression subset. + + >>> df.n_unique( + ... subset=[ + ... (pl.col("a") // 2), + ... (pl.col("c") | (pl.col("b") >= 2)), + ... ], + ... ) + 3 + ''' + def approx_n_unique(self) -> DataFrame: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> df.approx_n_unique() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + ''' + def rechunk(self) -> Self: + """ + Rechunk the data in this DataFrame to a contiguous allocation. + + This will make sure all subsequent operations have optimal and predictable + performance. + """ + def null_count(self) -> Self: + ''' + Create a new DataFrame that shows the null counts per column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.null_count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + ''' + def sample(self, n: int | Series | None = ...) -> Self: + ''' + Sample from this DataFrame. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + If set to True, the order of the sampled rows will be shuffled. If + set to False (default), the order of the returned rows will be + neither stable nor fully random. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8 ┆ c │ + │ 2 ┆ 7 ┆ b │ + └─────┴─────┴─────┘ + ''' + def fold(self, operation: Callable[[Series, Series], Series]) -> Series: + ''' + Apply a horizontal reduction on a DataFrame. 
+ + This can be used to effectively determine aggregations on a row level, and can + be applied to any DataType that can be supercasted (casted to a similar parent + type). + + An example of the supercast rules when applying an arithmetic operation on two + DataTypes are for instance: + + - Int8 + String = String + - Float32 + Int64 = Float32 + - Float32 + Float64 = Float64 + + Examples + -------- + A horizontal sum operation: + + >>> df = pl.DataFrame( + ... { + ... "a": [2, 1, 3], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [f64] + [ + 4.0 + 5.0 + 9.0 + ] + + A horizontal minimum operation: + + >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]}) + >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)) + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 1.0 + 3.0 + ] + + A horizontal string concatenation: + + >>> df = pl.DataFrame( + ... { + ... "a": ["foo", "bar", 2], + ... "b": [1, 2, 3], + ... "c": [1.0, 2.0, 3.0], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 + s2) + shape: (3,) + Series: \'a\' [str] + [ + "foo11.0" + "bar22.0" + null + ] + + A horizontal boolean or, similar to a row-wise .any(): + + >>> df = pl.DataFrame( + ... { + ... "a": [False, False, True], + ... "b": [False, True, False], + ... } + ... ) + >>> df.fold(lambda s1, s2: s1 | s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + + Parameters + ---------- + operation + function that takes two `Series` and returns a `Series`. + ''' + def row(self, index: int | None = ...) -> tuple[Any, ...] | dict[str, Any]: + ''' + Get the values of a single row, either by index or by predicate. + + Parameters + ---------- + index + Row index. + by_predicate + Select the row according to a given expression/predicate. + named + Return a dictionary instead of a tuple. The dictionary is a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Returns + ------- + tuple (default) or dictionary of row values + + Notes + ----- + The `index` and `by_predicate` params are mutually exclusive. Additionally, + to ensure clarity, the `by_predicate` parameter must be supplied by keyword. + + When using `by_predicate` it is an error condition if anything other than + one row is returned; more than one row raises `TooManyRowsReturnedError`, and + zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`). + + Warnings + -------- + You should NEVER use this method to iterate over a DataFrame; if you require + row-iteration you should strongly prefer use of `iter_rows()` instead. + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows : Materialise all frame data as a list of rows (potentially expensive). + item: Return dataframe element as a scalar. + + Examples + -------- + Specify an index to return the row at the given index as a tuple. + + >>> df = pl.DataFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> df.row(2) + (3, 8, \'c\') + + Specify `named=True` to get a dictionary instead with a mapping of column + names to row values. + + >>> df.row(2, named=True) + {\'foo\': 3, \'bar\': 8, \'ham\': \'c\'} + + Use `by_predicate` to return the row that matches the given predicate. 
+ + >>> df.row(by_predicate=(pl.col("ham") == "b")) + (2, 7, \'b\') + ''' + def rows(self) -> list[tuple[Any, ...]] | list[dict[str, Any]]: + ''' + Returns all data in the DataFrame as a list of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row-iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods. + Where possible you should also consider using `iter_rows` instead to avoid + materialising all the data at once. + + Returns + ------- + list of tuples (default) or dictionaries of row values + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "x": ["a", "b", "b", "a"], + ... "y": [1, 2, 3, 4], + ... "z": [0, 3, 6, 9], + ... } + ... ) + >>> df.rows() + [(\'a\', 1, 0), (\'b\', 2, 3), (\'b\', 3, 6), (\'a\', 4, 9)] + >>> df.rows(named=True) + [{\'x\': \'a\', \'y\': 1, \'z\': 0}, + {\'x\': \'b\', \'y\': 2, \'z\': 3}, + {\'x\': \'b\', \'y\': 3, \'z\': 6}, + {\'x\': \'a\', \'y\': 4, \'z\': 9}] + ''' + def rows_by_key(self, key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector]) -> dict[Any, Iterable[Any]]: + ''' + Returns DataFrame data as a keyed dictionary of python-native values. + + Note that this method should not be used in place of native operations, due to + the high cost of materializing all frame data out into a dictionary; it should + be used only when you need to move the values out into a Python data structure + or other object that cannot operate directly with Polars/Arrow. + + Parameters + ---------- + key + The column(s) to use as the key for the returned dictionary. If multiple + columns are specified, the key will be a tuple of those values, otherwise + it will be a string. + named + Return dictionary rows instead of tuples, mapping column name to row value. + include_key + Include key values inline with the associated data (by default the key + values are omitted as a memory/performance optimisation, as they can be + reconstructed from the key). + unique + Indicate that the key is unique; this will result in a 1:1 mapping from + key to a single associated row. Note that if the key is *not* actually + unique the last row with the given key will be returned. + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + See Also + -------- + rows : Materialize all frame data as a list of rows (potentially expensive). + iter_rows : Row iterator over frame data (does not materialize all rows). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... 
"w": ["a", "b", "b", "a"], + ... "x": ["q", "q", "q", "k"], + ... "y": [1.0, 2.5, 3.0, 4.5], + ... "z": [9, 8, 7, 6], + ... } + ... ) + + Group rows by the given key column(s): + + >>> df.rows_by_key(key=["w"]) + defaultdict(<class \'list\'>, + {\'a\': [(\'q\', 1.0, 9), (\'k\', 4.5, 6)], + \'b\': [(\'q\', 2.5, 8), (\'q\', 3.0, 7)]}) + + Return the same row groupings as dictionaries: + + >>> df.rows_by_key(key=["w"], named=True) + defaultdict(<class \'list\'>, + {\'a\': [{\'x\': \'q\', \'y\': 1.0, \'z\': 9}, + {\'x\': \'k\', \'y\': 4.5, \'z\': 6}], + \'b\': [{\'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'x\': \'q\', \'y\': 3.0, \'z\': 7}]}) + + Return row groupings, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], unique=True) + {9: (\'a\', \'q\', 1.0), + 8: (\'b\', \'q\', 2.5), + 7: (\'b\', \'q\', 3.0), + 6: (\'a\', \'k\', 4.5)} + + Return row groupings as dictionaries, assuming keys are unique: + + >>> df.rows_by_key(key=["z"], named=True, unique=True) + {9: {\'w\': \'a\', \'x\': \'q\', \'y\': 1.0}, + 8: {\'w\': \'b\', \'x\': \'q\', \'y\': 2.5}, + 7: {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0}, + 6: {\'w\': \'a\', \'x\': \'k\', \'y\': 4.5}} + + Return dictionary rows grouped by a compound key, including key values: + + >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True) + defaultdict(<class \'list\'>, + {(\'a\', \'q\'): [{\'w\': \'a\', \'x\': \'q\', \'y\': 1.0, \'z\': 9}], + (\'b\', \'q\'): [{\'w\': \'b\', \'x\': \'q\', \'y\': 2.5, \'z\': 8}, + {\'w\': \'b\', \'x\': \'q\', \'y\': 3.0, \'z\': 7}], + (\'a\', \'k\'): [{\'w\': \'a\', \'x\': \'k\', \'y\': 4.5, \'z\': 6}]}) + ''' + def iter_rows(self) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: + ''' + Returns an iterator over the DataFrame of rows of python-native values. + + Parameters + ---------- + named + Return dictionaries instead of tuples. The dictionaries are a mapping of + column name to row value. This is more expensive than returning a regular + tuple, but allows for accessing values by column name. + buffer_size + Determines the number of rows that are buffered internally while iterating + over the data; you should only modify this in very specific cases where the + default value is determined not to be a good fit to your access pattern, as + the speedup from using the buffer is significant (~2-4x). Setting this + value to zero disables row buffering (not recommended). + + Notes + ----- + If you have `ns`-precision temporal values you should be aware that Python + natively only supports up to `μs`-precision; `ns`-precision values will be + truncated to microseconds on conversion to Python. If this matters to your + use-case you should export to a different format (such as Arrow or NumPy). + + Warnings + -------- + Row iteration is not optimal as the underlying data is stored in columnar form; + where possible, prefer export via one of the dedicated export/output methods + that deals with columnar data. + + Returns + ------- + iterator of tuples (default) or dictionaries (if named) of python row values + + See Also + -------- + rows : Materialises all frame data as a list of rows (potentially expensive). + rows_by_key : Materialises frame data as a key-indexed dictionary. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [row[0] for row in df.iter_rows()] + [1, 3, 5] + >>> [row["b"] for row in df.iter_rows(named=True)] + [2, 4, 6] + ''' + def iter_columns(self) -> Iterator[Series]: + ''' + Returns an iterator over the DataFrame\'s columns. 
+ + Notes + ----- + Consider whether you can use :func:`all` instead. + If you can, it will be more efficient. + + Returns + ------- + Iterator of Series. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> [s.name for s in df.iter_columns()] + [\'a\', \'b\'] + + If you\'re using this to modify a dataframe\'s columns, e.g. + + >>> # Do NOT do this + >>> pl.DataFrame(column * 2 for column in df.iter_columns()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + + then consider whether you can use :func:`all` instead: + + >>> df.select(pl.all() * 2) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 4 │ + │ 6 ┆ 8 │ + │ 10 ┆ 12 │ + └─────┴─────┘ + ''' + def iter_slices(self, n_rows: int = ...) -> Iterator[DataFrame]: + ''' + Returns a non-copying iterator of slices over the underlying DataFrame. + + Parameters + ---------- + n_rows + Determines the number of rows contained in each DataFrame slice. + + Examples + -------- + >>> from datetime import date + >>> df = pl.DataFrame( + ... data={ + ... "a": range(17_500), + ... "b": date(2023, 1, 1), + ... "c": "klmnoopqrstuvwxyz", + ... }, + ... schema_overrides={"a": pl.Int32}, + ... ) + >>> for idx, frame in enumerate(df.iter_slices()): + ... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}") + DataFrame:[0]:10000 + DataFrame:[1]:7500 + + Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and + any supported frame export/conversion types; for example, as RecordBatches: + + >>> for frame in df.iter_slices(n_rows=15_000): + ... record_batch = frame.to_arrow().to_batches()[0] + ... print(f"{record_batch.schema}\\n<< {len(record_batch)}") + a: int32 + b: date32[day] + c: large_string + << 15000 + a: int32 + b: date32[day] + c: large_string + << 2500 + + See Also + -------- + iter_rows : Row iterator over frame data (does not materialise all rows). + partition_by : Split into multiple DataFrames, partitioned by groups. + ''' + def shrink_to_fit(self) -> Self: + """ + Shrink DataFrame memory usage. + + Shrinks to fit the exact capacity needed to hold the data. + """ + def gather_every(self, n: int, offset: int = ...) -> DataFrame: + ''' + Take every nth row in the DataFrame and return as a new DataFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + + Examples + -------- + >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + >>> s.gather_every(2) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + + >>> s.gather_every(2, offset=1) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + ''' + def hash_rows(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash and combine the rows in this DataFrame. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash_rows` does not guarantee stable results + across different Polars versions. 
Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 3, 4], + ... "ham": ["a", "b", None, "d"], + ... } + ... ) + >>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT + shape: (4,) + Series: \'\' [u64] + [ + 10783150408545073287 + 1438741209321515184 + 10047419486152048166 + 2047317070637311557 + ] + ''' + def interpolate(self) -> DataFrame: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> df.interpolate() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + ''' + def is_empty(self) -> bool: + ''' + Check if the dataframe is empty. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + >>> df.is_empty() + False + >>> df.filter(pl.col("foo") > 99).is_empty() + True + ''' + def to_struct(self, name: str = ...) -> Series: + ''' + Convert a `DataFrame` to a `Series` of type `Struct`. + + Parameters + ---------- + name + Name for the struct Series + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": ["one", "two", "three", "four", "five"], + ... } + ... ) + >>> df.to_struct("nums") + shape: (5,) + Series: \'nums\' [struct[2]] + [ + {1,"one"} + {2,"two"} + {3,"three"} + {4,"four"} + {5,"five"} + ] + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the dataframe at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct") + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + ''' + def corr(self, **kwargs: Any) -> DataFrame: + ''' + Return pairwise Pearson product-moment correlation coefficients between columns. + + See numpy `corrcoef` for more information: + https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html + + Notes + ----- + This functionality requires numpy to be installed. 
+ + Parameters + ---------- + **kwargs + Keyword arguments are passed to numpy `corrcoef`. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]}) + >>> df.corr() + shape: (3, 3) + ┌──────┬──────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════╡ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + │ -1.0 ┆ 1.0 ┆ -1.0 │ + │ 1.0 ┆ -1.0 ┆ 1.0 │ + └──────┴──────┴──────┘ + ''' + def merge_sorted(self, other: DataFrame, key: str) -> DataFrame: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. + It is the caller\'s responsibility that the frames are sorted + by that key; otherwise the output will not make sense. + + The schemas of both DataFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.DataFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0 + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.DataFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1 + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age") + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> DataFrame: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: DataFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ...) -> DataFrame: + ''' + Update the values in this `DataFrame` with the values in `other`. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + By default, null values in the right frame are ignored. Use + `include_nulls=True` to overwrite values in this frame with + null values in the other frame. + + Parameters + ---------- + other + DataFrame that will be used to update the values + on + Column names that will be joined on. If set to `None` (default), + the implicit row index of each frame is used as a join key. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + include_nulls + If True, null values from the right dataframe will be used to update the + left dataframe. 
+ + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce + when `include_nulls = False` + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> df + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_df = pl.DataFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... ) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> df.update(new_df) + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> df.update(new_df, how="inner") + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer") + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> df.update( + ... new_df, left_on="A", right_on="C", how="outer", include_nulls=True + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + ''' + def count(self) -> DataFrame: + ''' + Return the number of non-null elements for each column. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"a": [1, 2, 3, 4], "b": [1, 2, 1, None], "c": [None, None, None, None]} + ... ) + >>> df.count() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 3 ┆ 0 │ + └─────┴─────┴─────┘ + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> GroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + .. note:: + Within each group, the order of rows is always preserved, regardless + of this argument. + + Returns + ------- + GroupBy + Object which can be used to perform aggregations. + """ + def groupby_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. 
+ This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + """ + def group_by_rolling(self, index_column: IntoExpr) -> RollingGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`DataFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + """ + def groupby_dynamic(self, index_column: IntoExpr) -> DynamicGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + DynamicGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + ''' + def apply(self, function: Callable[[tuple[Any, ...]], Any], return_dtype: PolarsDataType | None = ...) -> DataFrame: + """ + Apply a custom/user-defined function (UDF) over the rows of the DataFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`DataFrame.map_rows`. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output type of the operation. If none given, Polars tries to infer the type. + inference_size + Only used in the case when the custom function returns rows. + This uses the first `n` rows to determine the output schema + """ + def shift_and_fill(self, fill_value: int | str | float) -> DataFrame: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with this value. + n + Number of places to shift (may be negative). + """ + def take_every(self, n: int, offset: int = ...) -> DataFrame: + """ + Take every nth row in the DataFrame and return as a new DataFrame. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + def find_idx_by_name(self, name: str) -> int: + """ + Find the index of a column by name. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`get_column_index`. + + Parameters + ---------- + name + Name of the column to find. + """ + def insert_at_idx(self, index: int, column: Series) -> Self: + """ + Insert a Series at a certain column index. This operation is in place. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`insert_column`. + + Parameters + ---------- + index + Column to insert the new `Series` column. + column + `Series` to insert. + """ + def replace_at_idx(self, index: int, new_column: Series) -> Self: + """ + Replace a column at an index location. + + .. deprecated:: 0.19.14 + This method has been renamed to :func:`replace_column`. + + Parameters + ---------- + index + Column index. + new_column + Series that will replace the column. 
+ """ + def frame_equal(self, other: DataFrame) -> bool: + """ + Check whether the DataFrame is equal to another DataFrame. + + .. deprecated:: 0.19.16 + This method has been renamed to :func:`equals`. + + Parameters + ---------- + other + DataFrame to compare with. + null_equal + Consider null values as equal. + """ + @property + def plot(self): ... + @property + def shape(self): ... + @property + def height(self): ... + @property + def width(self): ... + @property + def dtypes(self): ... + @property + def flags(self): ... + @property + def schema(self): ... +def _prepare_other_arg(other: Any, length: int | None = ...) -> Series: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.6/polars/expr/expr.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.6/polars/expr/expr.pyi new file mode 100644 index 0000000..ac8f6dc --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.6/polars/expr/expr.pyi @@ -0,0 +1,8420 @@ +#: version 0.20.6 +import P +import np as np +import pl +from builtins import PyExpr +from datetime import timedelta +from polars.datatypes.classes import Int64 as Int64 +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import _check_for_numpy as _check_for_numpy +from polars.exceptions import PolarsInefficientMapWarning as PolarsInefficientMapWarning +from polars.expr.array import ExprArrayNameSpace as ExprArrayNameSpace +from polars.expr.binary import ExprBinaryNameSpace as ExprBinaryNameSpace +from polars.expr.categorical import ExprCatNameSpace as ExprCatNameSpace +from polars.expr.datetime import ExprDateTimeNameSpace as ExprDateTimeNameSpace +from polars.expr.list import ExprListNameSpace as ExprListNameSpace +from polars.expr.meta import ExprMetaNameSpace as ExprMetaNameSpace +from polars.expr.name import ExprNameNameSpace as ExprNameNameSpace +from polars.expr.string import ExprStringNameSpace as ExprStringNameSpace +from polars.expr.struct import ExprStructNameSpace as ExprStructNameSpace +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions, parse_predicates_constraints_as_expression as parse_predicates_constraints_as_expression +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import threadpool_size as threadpool_size +from polars.utils.unstable import issue_unstable_warning as issue_unstable_warning, unstable as unstable +from polars.utils.various import no_default as no_default, sphinx_accessor as sphinx_accessor, warn_null_comparison as warn_null_comparison +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +py_arg_where: builtin_function_or_method +pyreduce: builtin_function_or_method + +class Expr: + class _map_batches_wrapper: + def __init__(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None) -> None: ... + def __call__(self, *args: Any, **kwargs: Any) -> Any: ... 
+ _pyexpr: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + @classmethod + def _from_pyexpr(cls, pyexpr: PyExpr) -> Self: ... + def _repr_html_(self) -> str: ... + def __bool__(self) -> NoReturn: ... + def __abs__(self) -> Self: ... + def __add__(self, other: IntoExpr) -> Self: ... + def __radd__(self, other: IntoExpr) -> Self: ... + def __and__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __rand__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __eq__(self, other: IntoExpr) -> Self: ... + def __floordiv__(self, other: IntoExpr) -> Self: ... + def __rfloordiv__(self, other: IntoExpr) -> Self: ... + def __ge__(self, other: IntoExpr) -> Self: ... + def __gt__(self, other: IntoExpr) -> Self: ... + def __invert__(self) -> Self: ... + def __le__(self, other: IntoExpr) -> Self: ... + def __lt__(self, other: IntoExpr) -> Self: ... + def __mod__(self, other: IntoExpr) -> Self: ... + def __rmod__(self, other: IntoExpr) -> Self: ... + def __mul__(self, other: IntoExpr) -> Self: ... + def __rmul__(self, other: IntoExpr) -> Self: ... + def __ne__(self, other: IntoExpr) -> Self: ... + def __neg__(self) -> Self: ... + def __or__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __ror__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __pos__(self) -> Expr: ... + def __pow__(self, exponent: IntoExprColumn | int | float) -> Self: ... + def __rpow__(self, base: IntoExprColumn | int | float) -> Expr: ... + def __sub__(self, other: IntoExpr) -> Self: ... + def __rsub__(self, other: IntoExpr) -> Self: ... + def __truediv__(self, other: IntoExpr) -> Self: ... + def __rtruediv__(self, other: IntoExpr) -> Self: ... + def __xor__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __rxor__(self, other: IntoExprColumn | int | bool) -> Self: ... + def __array_ufunc__(self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any) -> Self: + """Numpy universal functions.""" + @classmethod + def from_json(cls, value: str) -> Self: + """ + Read an expression from a JSON encoded string to construct an Expression. + + Parameters + ---------- + value + JSON encoded string value + """ + def to_physical(self) -> Self: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + + Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.factorize + <https://pandas.pydata.org/docs/reference/api/pandas.factorize.html>`_ + function. + + >>> pl.DataFrame({"vals": ["a", "x", None, "a"]}).with_columns( + ... [ + ... pl.col("vals").cast(pl.Categorical), + ... pl.col("vals") + ... .cast(pl.Categorical) + ... .to_physical() + ... .alias("vals_physical"), + ... ] + ... ) + shape: (4, 2) + ┌──────┬───────────────┐ + │ vals ┆ vals_physical │ + │ --- ┆ --- │ + │ cat ┆ u32 │ + ╞══════╪═══════════════╡ + │ a ┆ 0 │ + │ x ┆ 1 │ + │ null ┆ null │ + │ a ┆ 0 │ + └──────┴───────────────┘ + ''' + def any(self) -> Self: + ''' + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). 
+ + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False], + ... "b": [False, False], + ... "c": [None, False], + ... } + ... ) + >>> df.select(pl.col("*").any()) + shape: (1, 3) + ┌──────┬───────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪═══════╡ + │ true ┆ false ┆ false │ + └──────┴───────┴───────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").any(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + ''' + def all(self) -> Self: + ''' + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + .. note:: + This method is not to be confused with the function :func:`polars.all`, + which can be used to select all columns. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is null. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, True], + ... "b": [False, True], + ... "c": [None, True], + ... } + ... ) + >>> df.select(pl.col("*").all()) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ true │ + └──────┴───────┴──────┘ + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> df.select(pl.col("*").all(ignore_nulls=False)) + shape: (1, 3) + ┌──────┬───────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞══════╪═══════╪══════╡ + │ true ┆ false ┆ null │ + └──────┴───────┴──────┘ + ''' + def arg_true(self) -> Self: + ''' + Return indices where expression evaluates `True`. + + .. warning:: + Modifies number of rows returned, so will fail in combination with other + expressions. Use as only expression in `select` / `with_columns`. + + See Also + -------- + Series.arg_true : Return indices where Series is True + polars.arg_where + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 1]}) + >>> df.select((pl.col("a") == 1).arg_true()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 3 │ + └─────┘ + ''' + def sqrt(self) -> Self: + ''' + Compute the square root of the elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").sqrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.414214 │ + │ 2.0 │ + └──────────┘ + ''' + def cbrt(self) -> Self: + ''' + Compute the cube root of the elements. 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").cbrt()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.259921 │ + │ 1.587401 │ + └──────────┘ + ''' + def log10(self) -> Self: + ''' + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").log10()) + shape: (3, 1) + ┌─────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞═════════╡ + │ 0.0 │ + │ 0.30103 │ + │ 0.60206 │ + └─────────┘ + ''' + def exp(self) -> Self: + ''' + Compute the exponential, element-wise. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1.0, 2.0, 4.0]}) + >>> df.select(pl.col("values").exp()) + shape: (3, 1) + ┌──────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 2.718282 │ + │ 7.389056 │ + │ 54.59815 │ + └──────────┘ + ''' + def alias(self, name: str) -> Self: + ''' + Rename the expression. + + Parameters + ---------- + name + The new name. + + See Also + -------- + map + prefix + suffix + + Examples + -------- + Rename an expression to avoid overwriting an existing column. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns( + ... pl.col("a") + 10, + ... pl.col("b").str.to_uppercase().alias("c"), + ... ) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 11 ┆ x ┆ X │ + │ 12 ┆ y ┆ Y │ + │ 13 ┆ z ┆ Z │ + └─────┴─────┴─────┘ + + Overwrite the default name of literal columns to prevent errors due to duplicate + column names. + + >>> df.with_columns( + ... pl.lit(True).alias("c"), + ... pl.lit(4.0).alias("d"), + ... ) + shape: (3, 4) + ┌─────┬─────┬──────┬─────┐ + │ a ┆ b ┆ c ┆ d │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ bool ┆ f64 │ + ╞═════╪═════╪══════╪═════╡ + │ 1 ┆ x ┆ true ┆ 4.0 │ + │ 2 ┆ y ┆ true ┆ 4.0 │ + │ 3 ┆ z ┆ true ┆ 4.0 │ + └─────┴─────┴──────┴─────┘ + ''' + def map_alias(self, function: Callable[[str], str]) -> Self: + ''' + Rename the output of an expression by mapping a function over the root name. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.map`. + + Parameters + ---------- + function + Function that maps a root name to a new name. + + See Also + -------- + keep_name + prefix + suffix + + Examples + -------- + Remove a common suffix and convert to lower case. + + >>> df = pl.DataFrame( + ... { + ... "A_reverse": [3, 2, 1], + ... "B_reverse": ["z", "y", "x"], + ... } + ... ) + >>> df.with_columns( + ... pl.all().reverse().name.map(lambda c: c.rstrip("_reverse").lower()) + ... ) + shape: (3, 4) + ┌───────────┬───────────┬─────┬─────┐ + │ A_reverse ┆ B_reverse ┆ a ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═══════════╪═══════════╪═════╪═════╡ + │ 3 ┆ z ┆ 1 ┆ x │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 1 ┆ x ┆ 3 ┆ z │ + └───────────┴───────────┴─────┴─────┘ + ''' + def prefix(self, prefix: str) -> Self: + ''' + Add a prefix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.prefix`. + + Parameters + ---------- + prefix + Prefix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. 
+ + See Also + -------- + suffix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.prefix("reverse_")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ reverse_a ┆ reverse_b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + ''' + def suffix(self, suffix: str) -> Self: + ''' + Add a suffix to the root column name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.suffix`. + + Parameters + ---------- + suffix + Suffix to add to the root column name. + + Notes + ----- + This will undo any previous renaming operations on the expression. + + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + prefix + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["x", "y", "z"], + ... } + ... ) + >>> df.with_columns(pl.all().reverse().name.suffix("_reverse")) + shape: (3, 4) + ┌─────┬─────┬───────────┬───────────┐ + │ a ┆ b ┆ a_reverse ┆ b_reverse │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═════╪═══════════╪═══════════╡ + │ 1 ┆ x ┆ 3 ┆ z │ + │ 2 ┆ y ┆ 2 ┆ y │ + │ 3 ┆ z ┆ 1 ┆ x │ + └─────┴─────┴───────────┴───────────┘ + ''' + def keep_name(self) -> Self: + ''' + Keep the original root name of the expression. + + .. deprecated:: 0.19.12 + This method has been renamed to :func:`name.keep`. + + Notes + ----- + Due to implementation constraints, this method can only be called as the last + expression in a chain. + + See Also + -------- + alias + + Examples + -------- + Undo an alias operation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2], + ... "b": [3, 4], + ... } + ... ) + >>> df.with_columns((pl.col("a") * 9).alias("c").name.keep()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 3 │ + │ 18 ┆ 4 │ + └─────┴─────┘ + + Prevent errors due to duplicate column names. + + >>> df.select((pl.lit(10) / pl.all()).name.keep()) + shape: (2, 2) + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 10.0 ┆ 3.333333 │ + │ 5.0 ┆ 2.5 │ + └──────┴──────────┘ + ''' + def exclude(self, columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType], *more_columns: str | PolarsDataType) -> Self: + ''' + Exclude columns from a multi-column expression. + + Only works after a wildcard or regex column selection, and you cannot provide + both string column names *and* dtypes (you may prefer to use selectors instead). + + Parameters + ---------- + columns + The name or datatype of the column(s) to exclude. Accepts regular expression + input. Regular expressions should start with `^` and end with `$`. + *more_columns + Additional names or datatypes of columns to exclude, specified as positional + arguments. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "aa": [1, 2, 3], + ... "ba": ["a", "b", None], + ... "cc": [None, 2.5, 1.5], + ... } + ... 
) + >>> df + shape: (3, 3) + ┌─────┬──────┬──────┐ + │ aa ┆ ba ┆ cc │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞═════╪══════╪══════╡ + │ 1 ┆ a ┆ null │ + │ 2 ┆ b ┆ 2.5 │ + │ 3 ┆ null ┆ 1.5 │ + └─────┴──────┴──────┘ + + Exclude by column name(s): + + >>> df.select(pl.all().exclude("ba")) + shape: (3, 2) + ┌─────┬──────┐ + │ aa ┆ cc │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ null │ + │ 2 ┆ 2.5 │ + │ 3 ┆ 1.5 │ + └─────┴──────┘ + + Exclude by regex, e.g. removing all columns whose names end with the letter "a": + + >>> df.select(pl.all().exclude("^.*a$")) + shape: (3, 1) + ┌──────┐ + │ cc │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ 2.5 │ + │ 1.5 │ + └──────┘ + + Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64: + + >>> df.select(pl.all().exclude([pl.Int64, pl.Float64])) + shape: (3, 1) + ┌──────┐ + │ ba │ + │ --- │ + │ str │ + ╞══════╡ + │ a │ + │ b │ + │ null │ + └──────┘ + ''' + def pipe(self, function: Callable[Concatenate[Expr, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the expression as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def extract_number(expr: pl.Expr) -> pl.Expr: + ... """Extract the digits from a string.""" + ... return expr.str.extract(r"\\d+", 0).cast(pl.Int64) + >>> + >>> def scale_negative_even(expr: pl.Expr, *, n: int = 1) -> pl.Expr: + ... """Set even numbers negative, and scale by a user-supplied value.""" + ... expr = pl.when(expr % 2 == 0).then(-expr).otherwise(expr) + ... return expr * n + >>> + >>> df = pl.DataFrame({"val": ["a: 1", "b: 2", "c: 3", "d: 4"]}) + >>> df.with_columns( + ... udfs=( + ... pl.col("val").pipe(extract_number).pipe(scale_negative_even, n=5) + ... ), + ... ) + shape: (4, 2) + ┌──────┬──────┐ + │ val ┆ udfs │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞══════╪══════╡ + │ a: 1 ┆ 5 │ + │ b: 2 ┆ -10 │ + │ c: 3 ┆ 15 │ + │ d: 4 ┆ -20 │ + └──────┴──────┘ + + ''' + def is_not(self) -> Self: + """ + Negate a boolean expression. + + .. deprecated:: 0.19.2 + This method has been renamed to :func:`Expr.not_`. + """ + def not_(self) -> Self: + ''' + Negate a boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [True, False, False], + ... "b": ["a", "b", None], + ... } + ... ) + >>> df + shape: (3, 2) + ┌───────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ bool ┆ str │ + ╞═══════╪══════╡ + │ true ┆ a │ + │ false ┆ b │ + │ false ┆ null │ + └───────┴──────┘ + >>> df.select(pl.col("a").not_()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ true │ + └───────┘ + ''' + def is_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.all().is_null().name.suffix("_isnull")) # nan != null + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_isnull ┆ b_isnull │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ true ┆ false │ + │ 1 ┆ 1.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + ''' + def is_not_null(self) -> Self: + ''' + Returns a boolean Series indicating which values are not null. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns( + ... pl.all().is_not_null().name.suffix("_not_null") # nan != null + ... ) + shape: (5, 4) + ┌──────┬─────┬────────────┬────────────┐ + │ a ┆ b ┆ a_not_null ┆ b_not_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪════════════╪════════════╡ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 2 ┆ 2.0 ┆ true ┆ true │ + │ null ┆ NaN ┆ false ┆ true │ + │ 1 ┆ 1.0 ┆ true ┆ true │ + │ 5 ┆ 5.0 ┆ true ┆ true │ + └──────┴─────┴────────────┴────────────┘ + ''' + def is_finite(self) -> Self: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_finite()) + shape: (2, 2) + ┌──────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞══════╪═══════╡ + │ true ┆ true │ + │ true ┆ false │ + └──────┴───────┘ + ''' + def is_infinite(self) -> Self: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1.0, 2], + ... "B": [3.0, float("inf")], + ... } + ... ) + >>> df.select(pl.all().is_infinite()) + shape: (2, 2) + ┌───────┬───────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ bool ┆ bool │ + ╞═══════╪═══════╡ + │ false ┆ false │ + │ false ┆ true │ + └───────┴───────┘ + ''' + def is_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... ) + >>> df.with_columns(pl.col(pl.Float64).is_nan().name.suffix("_isnan")) + shape: (5, 3) + ┌──────┬─────┬─────────┐ + │ a ┆ b ┆ b_isnan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪═════════╡ + │ 1 ┆ 1.0 ┆ false │ + │ 2 ┆ 2.0 ┆ false │ + │ null ┆ NaN ┆ true │ + │ 1 ┆ 1.0 ┆ false │ + │ 5 ┆ 5.0 ┆ false │ + └──────┴─────┴─────────┘ + ''' + def is_not_nan(self) -> Self: + ''' + Returns a boolean Series indicating which values are not NaN. + + Notes + ----- + Floating point `NaN` (Not A Number) should not be confused + with missing data represented as `Null/None`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None, 1, 5], + ... "b": [1.0, 2.0, float("nan"), 1.0, 5.0], + ... } + ... 
) + >>> df.with_columns(pl.col(pl.Float64).is_not_nan().name.suffix("_is_not_nan")) + shape: (5, 3) + ┌──────┬─────┬──────────────┐ + │ a ┆ b ┆ b_is_not_nan │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪═════╪══════════════╡ + │ 1 ┆ 1.0 ┆ true │ + │ 2 ┆ 2.0 ┆ true │ + │ null ┆ NaN ┆ false │ + │ 1 ┆ 1.0 ┆ true │ + │ 5 ┆ 5.0 ┆ true │ + └──────┴─────┴──────────────┘ + ''' + def agg_groups(self) -> Self: + ''' + Get the group indexes of the group by operation. + + Should be used in aggregation context only. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [94, 95, 96, 97, 97, 99], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").agg_groups()) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[u32] │ + ╞═══════╪═══════════╡ + │ one ┆ [0, 1, 2] │ + │ two ┆ [3, 4, 5] │ + └───────┴───────────┘ + ''' + def count(self) -> Self: + ''' + Return the number of non-null elements in the column. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + len + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> df.select(pl.all().count()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 2 │ + └─────┴─────┘ + ''' + def len(self) -> Self: + ''' + Return the number of elements in the column. + + Null values count towards the total. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + count + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + └─────┴─────┘ + ''' + def slice(self, offset: int | Expr, length: int | Expr | None = ...) -> Self: + ''' + Get a slice of this expression. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10, 11], + ... "b": [None, 4, 4, 4], + ... } + ... ) + >>> df.select(pl.all().slice(1, 2)) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 9 ┆ 4 │ + │ 10 ┆ 4 │ + └─────┴─────┘ + ''' + def append(self, other: IntoExpr) -> Self: + ''' + Append expressions. + + This is done by adding the chunks of `other` to this `Series`. + + Parameters + ---------- + other + Expression to append. + upcast + Cast both `Series` to the same supertype. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.all().head(1).append(pl.all().tail(1))) + shape: (2, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 8 ┆ null │ + │ 10 ┆ 4 │ + └─────┴──────┘ + ''' + def rechunk(self) -> Self: + ''' + Create a single chunk of memory for this Series. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + + Create a Series with 3 nulls, append column a then rechunk + + >>> df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()) + shape: (6, 1) + ┌────────┐ + │ repeat │ + │ --- │ + │ i64 │ + ╞════════╡ + │ null │ + │ null │ + │ null │ + │ 1 │ + │ 1 │ + │ 2 │ + └────────┘ + ''' + def drop_nulls(self) -> Self: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nulls()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 3.0 │ + │ NaN │ + └─────┘ + ''' + def drop_nans(self) -> Self: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0, None, 3.0, float("nan")]}) + >>> df.select(pl.col("a").drop_nans()) + shape: (3, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.0 │ + │ null │ + │ 3.0 │ + └──────┘ + ''' + def cum_sum(self) -> Self: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_sum().alias("cum_sum"), + ... pl.col("a").cum_sum(reverse=True).alias("cum_sum_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_sum ┆ cum_sum_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 10 │ + │ 2 ┆ 3 ┆ 9 │ + │ 3 ┆ 6 ┆ 7 │ + │ 4 ┆ 10 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_sum().alias("value_cum_sum"), + ... pl.col("values") + ... .cum_sum() + ... .forward_fill() + ... .alias("value_cum_sum_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬───────────────┬──────────────────────────┐ + │ values ┆ value_cum_sum ┆ value_cum_sum_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═══════════════╪══════════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 18 ┆ 18 │ + │ 9 ┆ 27 ┆ 27 │ + │ null ┆ null ┆ 27 │ + │ 16 ┆ 43 ┆ 43 │ + │ null ┆ null ┆ 43 │ + └────────┴───────────────┴──────────────────────────┘ + ''' + def cum_prod(self) -> Self: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_prod().alias("cum_prod"), + ... pl.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), + ... 
) + shape: (4, 3) + ┌─────┬──────────┬──────────────────┐ + │ a ┆ cum_prod ┆ cum_prod_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════╪══════════════════╡ + │ 1 ┆ 1 ┆ 24 │ + │ 2 ┆ 2 ┆ 24 │ + │ 3 ┆ 6 ┆ 12 │ + │ 4 ┆ 24 ┆ 4 │ + └─────┴──────────┴──────────────────┘ + ''' + def cum_min(self) -> Self: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_min().alias("cum_min"), + ... pl.col("a").cum_min(reverse=True).alias("cum_min_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_min ┆ cum_min_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 1 ┆ 2 │ + │ 3 ┆ 1 ┆ 3 │ + │ 4 ┆ 1 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + ''' + def cum_max(self) -> Self: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("a").cum_max().alias("cum_max"), + ... pl.col("a").cum_max(reverse=True).alias("cum_max_reverse"), + ... ) + shape: (4, 3) + ┌─────┬─────────┬─────────────────┐ + │ a ┆ cum_max ┆ cum_max_reverse │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════════╪═════════════════╡ + │ 1 ┆ 1 ┆ 4 │ + │ 2 ┆ 2 ┆ 4 │ + │ 3 ┆ 3 ┆ 4 │ + │ 4 ┆ 4 ┆ 4 │ + └─────┴─────────┴─────────────────┘ + + Null values are excluded, but can also be filled by calling `forward_fill`. + + >>> df = pl.DataFrame({"values": [None, 10, None, 8, 9, None, 16, None]}) + >>> df.with_columns( + ... pl.col("values").cum_max().alias("cum_max"), + ... pl.col("values").cum_max().forward_fill().alias("cum_max_all_filled"), + ... ) + shape: (8, 3) + ┌────────┬─────────┬────────────────────┐ + │ values ┆ cum_max ┆ cum_max_all_filled │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞════════╪═════════╪════════════════════╡ + │ null ┆ null ┆ null │ + │ 10 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 8 ┆ 10 ┆ 10 │ + │ 9 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ 10 │ + │ 16 ┆ 16 ┆ 16 │ + │ null ┆ null ┆ 16 │ + └────────┴─────────┴────────────────────┘ + ''' + def cum_count(self) -> Self: + ''' + Return the cumulative count of the non-null values in the column. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": ["x", "k", None, "d"]}) + >>> df.with_columns( + ... pl.col("a").cum_count().alias("cum_count"), + ... pl.col("a").cum_count(reverse=True).alias("cum_count_reverse"), + ... ) + shape: (4, 3) + ┌──────┬───────────┬───────────────────┐ + │ a ┆ cum_count ┆ cum_count_reverse │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u32 ┆ u32 │ + ╞══════╪═══════════╪═══════════════════╡ + │ x ┆ 1 ┆ 3 │ + │ k ┆ 2 ┆ 2 │ + │ null ┆ 2 ┆ 1 │ + │ d ┆ 3 ┆ 1 │ + └──────┴───────────┴───────────────────┘ + ''' + def floor(self) -> Self: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").floor()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + │ 0.0 │ + │ 1.0 │ + │ 1.0 │ + └─────┘ + ''' + def ceil(self) -> Self: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0.3, 0.5, 1.0, 1.1]}) + >>> df.select(pl.col("a").ceil()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 1.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + ''' + def round(self, decimals: int = ...) -> Self: + ''' + Round underlying floating point data by `decimals` digits. + + Parameters + ---------- + decimals + Number of decimals to round by. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.33, 0.52, 1.02, 1.17]}) + >>> df.select(pl.col("a").round(1)) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.3 │ + │ 0.5 │ + │ 1.0 │ + │ 1.2 │ + └─────┘ + ''' + def round_sig_figs(self, digits: int) -> Self: + ''' + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.01234, 3.333, 1234.0]}) + >>> df.with_columns(pl.col("a").round_sig_figs(2).alias("round_sig_figs")) + shape: (3, 2) + ┌─────────┬────────────────┐ + │ a ┆ round_sig_figs │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════════╪════════════════╡ + │ 0.01234 ┆ 0.012 │ + │ 3.333 ┆ 3.3 │ + │ 1234.0 ┆ 1200.0 │ + └─────────┴────────────────┘ + ''' + def dot(self, other: Expr | str) -> Self: + ''' + Compute the dot/inner product between two Expressions. + + Parameters + ---------- + other + Expression to compute dot product with. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.select(pl.col("a").dot(pl.col("b"))) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 44 │ + └─────┘ + ''' + def mode(self) -> Self: + ''' + Compute the most occurring value(s). + + Can return multiple Values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 1, 2, 3], + ... "b": [1, 1, 2, 2], + ... } + ... ) + >>> df.select(pl.all().mode()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 2 │ + └─────┴─────┘ + ''' + def cast(self, dtype: PolarsDataType | type[Any]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": ["4", "5", "6"], + ... } + ... ) + >>> df.with_columns( + ... [ + ... pl.col("a").cast(pl.Float64), + ... pl.col("b").cast(pl.Int32), + ... ] + ... ) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4 │ + │ 2.0 ┆ 5 │ + │ 3.0 ┆ 6 │ + └─────┴─────┘ + ''' + def sort(self) -> Self: + ''' + Sort this column. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3, 2], + ... } + ... 
) + >>> df.select(pl.col("a").sort()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 1 │ + │ 2 │ + │ 3 │ + └──────┘ + >>> df.select(pl.col("a").sort(descending=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ null │ + │ 3 │ + │ 2 │ + │ 1 │ + └──────┘ + >>> df.select(pl.col("a").sort(nulls_last=True)) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ null │ + └──────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df = pl.DataFrame( + ... { + ... "group": ["one", "one", "one", "two", "two", "two"], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("value").sort()) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬────────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪════════════╡ + │ two ┆ [3, 4, 99] │ + │ one ┆ [1, 2, 98] │ + └───────┴────────────┘ + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Self: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("value").top_k().alias("top_k"), + ... pl.col("value").bottom_k().alias("bottom_k"), + ... ] + ... ) + shape: (5, 2) + ┌───────┬──────────┐ + │ top_k ┆ bottom_k │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═══════╪══════════╡ + │ 99 ┆ 1 │ + │ 98 ┆ 2 │ + │ 4 ┆ 3 │ + │ 3 ┆ 4 │ + │ 2 ┆ 98 │ + └───────┴──────────┘ + ''' + def arg_sort(self) -> Self: + ''' + Get the index values that would sort this column. + + Parameters + ---------- + descending + Sort in descending (descending) order. + nulls_last + Place null values last instead of first. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_sort()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 0 │ + │ 2 │ + └─────┘ + ''' + def arg_max(self) -> Self: + ''' + Get the index of the maximal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... ) + >>> df.select(pl.col("a").arg_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + ''' + def arg_min(self) -> Self: + ''' + Get the index of the minimal value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [20, 10, 30], + ... } + ... 
) + >>> df.select(pl.col("a").arg_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + └─────┘ + ''' + def search_sorted(self, element: IntoExpr, side: SearchSortedSide = ...) -> Self: + ''' + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {\'any\', \'left\', \'right\'} + If \'any\', the index of the first suitable location found is given. + If \'left\', the index of the leftmost suitable location found is given. + If \'right\', return the rightmost suitable location found is given. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "values": [1, 2, 3, 5], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("values").search_sorted(0).alias("zero"), + ... pl.col("values").search_sorted(3).alias("three"), + ... pl.col("values").search_sorted(6).alias("six"), + ... ] + ... ) + shape: (1, 3) + ┌──────┬───────┬─────┐ + │ zero ┆ three ┆ six │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞══════╪═══════╪═════╡ + │ 0 ┆ 2 ┆ 4 │ + └──────┴───────┴─────┘ + ''' + def sort_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort this column by the ordering of other columns. + + When used in a projection/selection context, the whole column is sorted. + When used in a group by context, the groups are sorted. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "a", "b", "b"], + ... "value1": [1, 3, 4, 2], + ... "value2": [8, 7, 6, 5], + ... } + ... ) + >>> df.select(pl.col("group").sort_by("value1")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + Sorting by expressions is also supported. + + >>> df.select(pl.col("group").sort_by(pl.col("value1") + pl.col("value2"))) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ a │ + │ b │ + └───────┘ + + Sort by multiple columns by passing a list of columns. + + >>> df.select(pl.col("group").sort_by(["value1", "value2"], descending=True)) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ b │ + │ a │ + │ b │ + │ a │ + └───────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> df.select(pl.col("group").sort_by("value1", "value2")) + shape: (4, 1) + ┌───────┐ + │ group │ + │ --- │ + │ str │ + ╞═══════╡ + │ a │ + │ b │ + │ a │ + │ b │ + └───────┘ + + When sorting in a group by context, the groups are sorted. + + >>> df.group_by("group").agg( + ... pl.col("value1").sort_by("value2") + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value1 │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [3, 1] │ + │ b ┆ [2, 4] │ + └───────┴───────────┘ + + Take a single row from each group where a column attains its minimal value + within that group. + + >>> df.group_by("group").agg( + ... pl.all().sort_by("value2").first() + ... 
) # doctest: +IGNORE_RESULT + shape: (2, 3) + ┌───────┬────────┬────────┐ + │ group ┆ value1 ┆ value2 | + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 | + ╞═══════╪════════╪════════╡ + │ a ┆ 3 ┆ 7 | + │ b ┆ 2 ┆ 5 | + └───────┴────────┴────────┘ + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + ''' + Take values by index. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + + Returns + ------- + Expr + Expression of the same data type. + + See Also + -------- + Expr.get : Take a single value + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg( + ... pl.col("value").gather([2, 1]) + ... ) + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ one ┆ [2, 98] │ + │ two ┆ [4, 99] │ + └───────┴───────────┘ + ''' + def get(self, index: int | Expr) -> Self: + ''' + Return a single value by index. + + Parameters + ---------- + index + An expression that leads to a UInt32 index. + + Returns + ------- + Expr + Expression of the same data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": [ + ... "one", + ... "one", + ... "one", + ... "two", + ... "two", + ... "two", + ... ], + ... "value": [1, 98, 2, 3, 99, 4], + ... } + ... ) + >>> df.group_by("group", maintain_order=True).agg(pl.col("value").get(1)) + shape: (2, 2) + ┌───────┬───────┐ + │ group ┆ value │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═══════╡ + │ one ┆ 98 │ + │ two ┆ 99 │ + └───────┴───────┘ + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> df = pl.DataFrame({"a": [1, 2, 3, 4]}) + >>> df.with_columns(shift=pl.col("a").shift()) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ null │ + │ 2 ┆ 1 │ + │ 3 ┆ 2 │ + │ 4 ┆ 3 │ + └─────┴───────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> df.with_columns(shift=pl.col("a").shift(-2)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ null │ + │ 4 ┆ null │ + └─────┴───────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> df.with_columns(shift=pl.col("a").shift(-2, fill_value=100)) + shape: (4, 2) + ┌─────┬───────┐ + │ a ┆ shift │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═══════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + │ 3 ┆ 100 │ + │ 4 ┆ 100 │ + └─────┴───────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + To interpolate over null values see interpolate. + See the examples below to fill nulls with an expression. 
+ + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_null(strategy="zero")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 0 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(99)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 99 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(strategy="forward")) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ null ┆ 6 │ + └──────┴─────┘ + >>> df.with_columns(pl.col("b").fill_null(pl.col("b").median())) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪═════╡ + │ 1 ┆ 4.0 │ + │ 2 ┆ 5.0 │ + │ null ┆ 6.0 │ + └──────┴─────┘ + >>> df.with_columns(pl.all().fill_null(pl.all().median())) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ 2.0 ┆ 5.0 │ + │ 1.5 ┆ 6.0 │ + └─────┴─────┘ + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN value with a fill value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1.0, None, float("nan")], + ... "b": [4.0, float("nan"), 6], + ... } + ... ) + >>> df.with_columns(pl.col("b").fill_nan(0)) + shape: (3, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 4.0 │ + │ null ┆ 0.0 │ + │ NaN ┆ 6.0 │ + └──────┴─────┘ + ''' + def forward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the latest seen values. + + Parameters + ---------- + limit + The number of consecutive null values to forward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... } + ... ) + >>> df.select(pl.all().forward_fill()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 4 │ + │ 2 ┆ 6 │ + └─────┴─────┘ + ''' + def backward_fill(self, limit: int | None = ...) -> Self: + ''' + Fill missing values with the next to be seen values. + + Parameters + ---------- + limit + The number of consecutive null values to backward fill. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": [4, None, 6], + ... "c": [None, None, 2], + ... } + ... ) + >>> df.select(pl.all().backward_fill()) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 2 │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴─────┘ + >>> df.select(pl.all().backward_fill(limit=1)) + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ 1 ┆ 4 ┆ null │ + │ 2 ┆ 6 ┆ 2 │ + │ null ┆ 6 ┆ 2 │ + └──────┴─────┴──────┘ + ''' + def reverse(self) -> Self: + ''' + Reverse the selection. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [1, 2, 3, 4, 5], + ... 
"fruits": ["banana", "banana", "apple", "apple", "banana"], + ... "B": [5, 4, 3, 2, 1], + ... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.all(), + ... pl.all().reverse().name.suffix("_reverse"), + ... ] + ... ) + shape: (5, 8) + ┌─────┬────────┬─────┬────────┬───────────┬────────────────┬───────────┬──────────────┐ + │ A ┆ fruits ┆ B ┆ cars ┆ A_reverse ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════╪════════╪═══════════╪════════════════╪═══════════╪══════════════╡ + │ 1 ┆ banana ┆ 5 ┆ beetle ┆ 5 ┆ banana ┆ 1 ┆ beetle │ + │ 2 ┆ banana ┆ 4 ┆ audi ┆ 4 ┆ apple ┆ 2 ┆ beetle │ + │ 3 ┆ apple ┆ 3 ┆ beetle ┆ 3 ┆ apple ┆ 3 ┆ beetle │ + │ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 ┆ banana ┆ 4 ┆ audi │ + │ 5 ┆ banana ┆ 1 ┆ beetle ┆ 1 ┆ banana ┆ 5 ┆ beetle │ + └─────┴────────┴─────┴────────┴───────────┴────────────────┴───────────┴──────────────┘ + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Get standard deviation. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").std()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Get variance. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").var()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + ''' + def max(self) -> Self: + ''' + Get maximum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + ''' + def min(self) -> Self: + ''' + Get minimum value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]}) + >>> df.select(pl.col("a").min()) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ -1.0 │ + └──────┘ + ''' + def nan_max(self) -> Self: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_max()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + ''' + def nan_min(self) -> Self: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0, float("nan")]}) + >>> df.select(pl.col("a").nan_min()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ NaN │ + └─────┘ + ''' + def sum(self) -> Self: + ''' + Get sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").sum()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 0 │ + └─────┘ + ''' + def mean(self) -> Self: + ''' + Get mean value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").mean()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + ''' + def median(self) -> Self: + ''' + Get median value using linear interpolation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 1]}) + >>> df.select(pl.col("a").median()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + ''' + def product(self) -> Self: + ''' + Compute the product of an expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").product()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + ''' + def n_unique(self) -> Self: + ''' + Count unique values. + + Notes + ----- + `null` is considered to be a unique value for the purposes of this operation. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 1, 2, 2, 3], "y": [1, 1, 1, None, None]}) + >>> df.select( + ... x_unique=pl.col("x").n_unique(), + ... y_unique=pl.col("y").n_unique(), + ... ) + shape: (1, 2) + ┌──────────┬──────────┐ + │ x_unique ┆ y_unique │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞══════════╪══════════╡ + │ 3 ┆ 2 │ + └──────────┴──────────┘ + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> df = pl.DataFrame({"n": [1, 1, 2]}) + >>> df.select(pl.col("n").approx_n_unique()) + shape: (1, 1) + ┌─────┐ + │ n │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + >>> df = pl.DataFrame({"n": range(1000)}) + >>> df.select( + ... exact=pl.col("n").n_unique(), + ... approx=pl.col("n").approx_n_unique(), + ... ) # doctest: +SKIP + shape: (1, 2) + ┌───────┬────────┐ + │ exact ┆ approx │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═══════╪════════╡ + │ 1000 ┆ 1005 │ + └───────┴────────┘ + ''' + def null_count(self) -> Self: + ''' + Count null values. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [None, 1, None], + ... "b": [10, None, 300], + ... "c": [350, 650, 850], + ... } + ... ) + >>> df.select(pl.all().null_count()) + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + ''' + def arg_unique(self) -> Self: + ''' + Get index of first unique value. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [8, 9, 10], + ... "b": [None, 4, 4], + ... } + ... ) + >>> df.select(pl.col("a").arg_unique()) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + │ 2 │ + └─────┘ + >>> df.select(pl.col("b").arg_unique()) + shape: (2, 1) + ┌─────┐ + │ b │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 0 │ + │ 1 │ + └─────┘ + ''' + def unique(self) -> Self: + ''' + Get unique values of this expression. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").unique()) # doctest: +IGNORE_RESULT + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + └─────┘ + >>> df.select(pl.col("a").unique(maintain_order=True)) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + └─────┘ + ''' + def first(self) -> Self: + ''' + Get the first value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").first()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + └─────┘ + ''' + def last(self) -> Self: + ''' + Get the last value. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").last()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + ''' + def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self: + ''' + Compute expressions over the given groups. + + This expression is similar to performing a group by aggregation and joining the + result back into the original DataFrame. + + The outcome is similar to how `window functions + `_ + work in PostgreSQL. + + Parameters + ---------- + expr + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_exprs + Additional columns to group by, specified as positional arguments. + mapping_strategy: {\'group_to_rows\', \'join\', \'explode\'} + - group_to_rows + If the aggregation results in multiple values, assign them back to their + position in the DataFrame. This can only be done if the group yields + the same elements before aggregation as after. + - join + Join the groups as \'List\' to the row positions. + warning: this can be memory intensive. + - explode + Don\'t do any mapping, but simply flatten the group. + This only makes sense if the input data is sorted. + + Examples + -------- + Pass the name of a column to compute the expression over that column. + + >>> df = pl.DataFrame( + ... { + ... "a": ["a", "a", "b", "b", "b"], + ... "b": [1, 2, 3, 5, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> df.with_columns( + ... pl.col("c").max().over("a").name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 5 │ + │ b ┆ 3 ┆ 3 ┆ 3 │ + │ b ┆ 5 ┆ 2 ┆ 3 │ + │ b ┆ 3 ┆ 1 ┆ 3 │ + └─────┴─────┴─────┴───────┘ + + Expression input is supported. + + >>> df.with_columns( + ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_max │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 4 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 4 │ + └─────┴─────┴─────┴───────┘ + + Group by multiple columns by passing a list of column names or expressions. + + >>> df.with_columns( + ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 2 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + + Or use positional arguments to group by multiple columns in the same way. + + >>> df.with_columns( + ... 
pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), + ... ) + shape: (5, 4) + ┌─────┬─────┬─────┬───────┐ + │ a ┆ b ┆ c ┆ c_min │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╡ + │ a ┆ 1 ┆ 5 ┆ 5 │ + │ a ┆ 2 ┆ 4 ┆ 4 │ + │ b ┆ 3 ┆ 3 ┆ 1 │ + │ b ┆ 5 ┆ 2 ┆ 1 │ + │ b ┆ 3 ┆ 1 ┆ 1 │ + └─────┴─────┴─────┴───────┘ + ''' + def rolling(self, index_column: str) -> Self: + ''' + Create rolling groups based on a time, Int32, or Int64 column. + + If you have a time series ``, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order. + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> df.with_columns( + ... sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + ... min_a=pl.min("a").rolling(index_column="dt", period="2d"), + ... max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ... 
) + shape: (6, 5) + ┌─────────────────────┬─────┬───────┬───────┬───────┐ + │ dt ┆ a ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 7 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 5 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 9 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 2 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴─────┴───────┴───────┴───────┘ + ''' + def is_unique(self) -> Self: + ''' + Get mask of unique values. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_unique()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ true │ + └───────┘ + ''' + def is_first_distinct(self) -> Self: + ''' + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_first_distinct().alias("first")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ first │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ true │ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 2 ┆ false │ + └─────┴───────┘ + ''' + def is_last_distinct(self) -> Self: + ''' + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2, 3, 2]}) + >>> df.with_columns(pl.col("a").is_last_distinct().alias("last")) + shape: (5, 2) + ┌─────┬───────┐ + │ a ┆ last │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪═══════╡ + │ 1 ┆ false │ + │ 1 ┆ true │ + │ 2 ┆ false │ + │ 3 ┆ true │ + │ 2 ┆ true │ + └─────┴───────┘ + ''' + def is_duplicated(self) -> Self: + ''' + Return a boolean mask indicating duplicated values. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 1, 2]}) + >>> df.select(pl.col("a").is_duplicated()) + shape: (3, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ false │ + └───────┘ + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").peak_max()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ false │ + │ false │ + │ false │ + │ true │ + └───────┘ + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> df = pl.DataFrame({"a": [4, 1, 3, 2, 5]}) + >>> df.select(pl.col("a").peak_min()) + shape: (5, 1) + ┌───────┐ + │ a │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Get quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [0, 1, 2, 3, 4, 5]}) + >>> df.select(pl.col("a").quantile(0.3)) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="higher")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="lower")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="midpoint")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + >>> df.select(pl.col("a").quantile(0.3, interpolation="linear")) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.5 │ + └─────┘ + ''' + def cut(self, breaks: Sequence[float]) -> Self: + ''' + Bin continuous values into discrete categories. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide a column into three categories. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], labels=["a", "b", "c"]).alias("cut") + ... ) + shape: (5, 2) + ┌─────┬─────┐ + │ foo ┆ cut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪═════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴─────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").cut([-1, 1], include_breaks=True).alias("cut") + ... ).unnest("cut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Self: + ''' + Bin continuous values into discrete categories based on their quantiles. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of categories. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. 
This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + + Returns + ------- + Expr + Expression of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise an expression of data type :class:`Struct`. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> df = pl.DataFrame({"foo": [-2, -1, 0, 1, 2]}) + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], labels=["a", "b", "c"]).alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ a │ + │ -1 ┆ a │ + │ 0 ┆ b │ + │ 1 ┆ b │ + │ 2 ┆ c │ + └─────┴──────┘ + + Divide a column into two categories using uniform quantile probabilities. + + >>> df.with_columns( + ... pl.col("foo") + ... .qcut(2, labels=["low", "high"], left_closed=True) + ... .alias("qcut") + ... ) + shape: (5, 2) + ┌─────┬──────┐ + │ foo ┆ qcut │ + │ --- ┆ --- │ + │ i64 ┆ cat │ + ╞═════╪══════╡ + │ -2 ┆ low │ + │ -1 ┆ low │ + │ 0 ┆ high │ + │ 1 ┆ high │ + │ 2 ┆ high │ + └─────┴──────┘ + + Add both the category and the breakpoint. + + >>> df.with_columns( + ... pl.col("foo").qcut([0.25, 0.75], include_breaks=True).alias("qcut") + ... ).unnest("qcut") + shape: (5, 3) + ┌─────┬──────┬────────────┐ + │ foo ┆ brk ┆ foo_bin │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪══════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴──────┴────────────┘ + ''' + def rle(self) -> Self: + ''' + Get the lengths and values of runs of identical values. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with Fields "lengths" and "values". + + See Also + -------- + rle_id + + Examples + -------- + >>> df = pl.DataFrame(pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3])) + >>> df.select(pl.col("s").rle()).unnest("s") + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Self: + ''' + Get a distinct integer ID for each run of identical values. + + The ID increases by one each time the value of a column (which can be a + :class:`Struct`) changes. + + This is especially useful when you want to define a new group for every time a + column\'s value changes, rather than for every distinct value of that column. + + See Also + -------- + rle + + Examples + -------- + >>> df = pl.DataFrame(dict(a=[1, 2, 1, 1, 1], b=["x", "x", None, "y", "y"])) + >>> # It works on structs of multiple values too! + >>> df.with_columns(a_r=pl.col("a").rle_id(), ab_r=pl.struct("a", "b").rle_id()) + shape: (5, 4) + ┌─────┬──────┬─────┬──────┐ + │ a ┆ b ┆ a_r ┆ ab_r │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ u32 ┆ u32 │ + ╞═════╪══════╪═════╪══════╡ + │ 1 ┆ x ┆ 0 ┆ 0 │ + │ 2 ┆ x ┆ 1 ┆ 1 │ + │ 1 ┆ null ┆ 2 ┆ 2 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + │ 1 ┆ y ┆ 2 ┆ 3 │ + └─────┴──────┴─────┴──────┘ + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn], **constraints: Any) -> Self: + ''' + Filter the expression based on one or more predicate expressions. + + The original order of the remaining elements is preserved. + + Mostly useful in an aggregation context. If you want to filter on a DataFrame + level, use `LazyFrame.filter`. 
+ + Parameters + ---------- + predicates + Expression(s) that evaluates to a boolean Series. + constraints + Column filters; use `name = value` to filter columns by the supplied value. + Each constraint will behave the same as `pl.col(name).eq(value)`, and + will be implicitly joined with the other filter conditions using `&`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( + ... lt=pl.col("b").filter(pl.col("b") < 2).sum(), + ... gte=pl.col("b").filter(pl.col("b") >= 2).sum(), + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + + Filter expressions can also take constraints as keyword arguments. + + >>> import polars.selectors as cs + >>> df = pl.DataFrame( + ... { + ... "key": ["a", "a", "a", "a", "b", "b", "b", "b", "b"], + ... "n": [1, 2, 2, 3, 1, 3, 3, 2, 3], + ... }, + ... ) + >>> df.group_by("key").agg( + ... n_1=pl.col("n").filter(n=1).sum(), + ... n_2=pl.col("n").filter(n=2).sum(), + ... n_3=pl.col("n").filter(n=3).sum(), + ... ).sort(by="key") + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ key ┆ n_1 ┆ n_2 ┆ n_3 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ 1 ┆ 4 ┆ 3 │ + │ b ┆ 1 ┆ 2 ┆ 9 │ + └─────┴─────┴─────┴─────┘ + ''' + def where(self, predicate: Expr) -> Self: + ''' + Filter a single column. + + .. deprecated:: 0.20.4 + Use :func:`filter` instead. + + Alias for :func:`filter`. + + Parameters + ---------- + predicate + Boolean expression. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group_col": ["g1", "g1", "g2"], + ... "b": [1, 2, 3], + ... } + ... ) + >>> df.group_by("group_col").agg( # doctest: +SKIP + ... [ + ... pl.col("b").where(pl.col("b") < 2).sum().alias("lt"), + ... pl.col("b").where(pl.col("b") >= 2).sum().alias("gte"), + ... ] + ... ).sort("group_col") + shape: (2, 3) + ┌───────────┬─────┬─────┐ + │ group_col ┆ lt ┆ gte │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════════╪═════╪═════╡ + │ g1 ┆ 1 ┆ 2 │ + │ g2 ┆ 0 ┆ 3 │ + └───────────┴─────┴─────┘ + ''' + def map_batches(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Apply a custom python function to a whole Series or sequence of Series. + + The output of this custom function must be a Series (or a NumPy array, in which + case it will be automatically converted into a Series). If you want to apply a + custom function elementwise over single values, see :func:`map_elements`. + A reasonable use case for `map` functions is transforming the values + represented by an expression using a third-party library. + + .. warning:: + If you are looking to map a function over a window function or group_by + context, refer to :func:`map_elements` instead. + Read more in `the book + `_. + + Parameters + ---------- + function + Lambda/function to apply. + return_dtype + Dtype of the output Series. + is_elementwise + If set to true this can run in the streaming engine, but may yield + incorrect results in group-by. Ensure you know what you are doing! + agg_list + Aggregate the values of the expression into a list before applying the + function. This parameter only works in a group-by context. + The function will be invoked only once on a list of groups, rather than + once per group. 
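# Editor's sketch, not part of the generated stub: passing `return_dtype`
# explicitly so the output schema is known up front (see the Warnings
# paragraph below about omitting it).
import polars as pl
df = pl.DataFrame({"x": [1.0, 2.0, 3.0]})
df.select(pl.col("x").map_batches(lambda s: s * 2, return_dtype=pl.Float64))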
+ + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + See Also + -------- + map_elements + replace + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "sine": [0.0, 1.0, 0.0, -1.0], + ... "cosine": [1.0, 0.0, -1.0, 0.0], + ... } + ... ) + >>> df.select(pl.all().map_batches(lambda x: x.to_numpy().argmax())) + shape: (1, 2) + ┌──────┬────────┐ + │ sine ┆ cosine │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪════════╡ + │ 1 ┆ 0 │ + └──────┴────────┘ + + In a group-by context, the `agg_list` parameter can improve performance if used + correctly. The following example has `agg_list` set to `False`, which causes + the function to be applied once per group. The input of the function is a + Series of type `Int64`. This is less efficient. + + >>> df = pl.DataFrame( + ... { + ... "a": [0, 1, 0, 1], + ... "b": [1, 2, 3, 4], + ... } + ... ) + >>> df.group_by("a").agg( + ... pl.col("b").map_batches(lambda x: x.max(), agg_list=False) + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ list[i64] │ + ╞═════╪═══════════╡ + │ 1 ┆ [4] │ + │ 0 ┆ [3] │ + └─────┴───────────┘ + + Using `agg_list=True` would be more efficient. In this example, the input of + the function is a Series of type `List(Int64)`. + + >>> df.group_by("a").agg( + ... pl.col("b").map_batches(lambda x: x.list.max(), agg_list=True) + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴─────┘ + ''' + def map_elements(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) to each element of a column. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + The UDF is applied to each element of a column. Note that, in a GroupBy + context, the column will have been pre-aggregated and so each element + will itself be a Series. Therefore, depending on the context, + requirements for `function` differ: + + * Selection + Expects `function` to be of type `Callable[[Any], Any]`. + Applies a Python function to each individual value in the column. + * GroupBy + Expects `function` to be of type `Callable[[Series], Any]`. + For each group, applies a Python function to the slice of the column + corresponding to that group. + + Parameters + ---------- + function + Lambda/function to map. + return_dtype + Dtype of the output Series. + If not set, the dtype will be `pl.Unknown`. + skip_nulls + Don\'t map the function over values that contain nulls (this is faster). + pass_name + Pass the Series name to the custom function (this is more expensive). + strategy : {\'thread_local\', \'threading\'} + The threading strategy to use. + + - \'thread_local\': run the python function on a single thread. + - \'threading\': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. 
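# Editor's sketch, not part of the generated stub: the Notes below suggest
# caching an expensive UDF with `functools.lru_cache`; `expensive` is a
# hypothetical stand-in for such a function.
from functools import lru_cache

import polars as pl

@lru_cache(maxsize=None)
def expensive(x: int) -> int:
    return x * x  # placeholder for a costly per-value computation

df = pl.DataFrame({"a": [1, 2, 2, 1]})
df.with_columns(squared=pl.col("a").map_elements(expensive, return_dtype=pl.Int64))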
+ + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + * Using `map_elements` is strongly discouraged as you will be effectively + running python "for" loops, which will be very slow. Wherever possible you + should prefer the native expression API to achieve the best performance. + + * If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + * Window function application using `over` is considered a GroupBy context + here, so `map_elements` can be used to map functions over window groups. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3, 1], + ... "b": ["a", "b", "c", "c"], + ... } + ... ) + + The function is applied to each element of column `\'a\'`: + + >>> df.with_columns( # doctest: +SKIP + ... pl.col("a").map_elements(lambda x: x * 2).alias("a_times_2"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────────┐ + │ a ┆ b ┆ a_times_2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ a ┆ 2 │ + │ 2 ┆ b ┆ 4 │ + │ 3 ┆ c ┆ 6 │ + │ 1 ┆ c ┆ 2 │ + └─────┴─────┴───────────┘ + + Tip: it is better to implement this with an expression: + + >>> df.with_columns( + ... (pl.col("a") * 2).alias("a_times_2"), + ... ) # doctest: +IGNORE_RESULT + + In a GroupBy context, each element of the column is itself a Series: + + >>> ( + ... df.lazy().group_by("b").agg(pl.col("a")).collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬───────────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [1] │ + │ b ┆ [2] │ + │ c ┆ [3, 1] │ + └─────┴───────────┘ + + Therefore, from the user\'s point-of-view, the function is applied per-group: + + >>> ( + ... df.lazy() + ... .group_by("b") + ... .agg(pl.col("a").map_elements(lambda x: x.sum())) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ b ┆ 2 │ + │ c ┆ 4 │ + └─────┴─────┘ + + Tip: again, it is better to implement this with an expression: + + >>> ( + ... df.lazy() + ... .group_by("b", maintain_order=True) + ... .agg(pl.col("a").sum()) + ... .collect() + ... ) # doctest: +IGNORE_RESULT + + Window function application using `over` will behave as a GroupBy + context, with your function receiving individual window groups: + + >>> df = pl.DataFrame( + ... { + ... "key": ["x", "x", "y", "x", "y", "z"], + ... "val": [1, 1, 1, 1, 1, 1], + ... } + ... ) + >>> df.with_columns( + ... scaled=pl.col("val").map_elements(lambda s: s * len(s)).over("key"), + ... ).sort("key") + shape: (6, 3) + ┌─────┬─────┬────────┐ + │ key ┆ val ┆ scaled │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ x ┆ 1 ┆ 3 │ + │ y ┆ 1 ┆ 2 │ + │ y ┆ 1 ┆ 2 │ + │ z ┆ 1 ┆ 1 │ + └─────┴─────┴────────┘ + + Note that this function would *also* be better-implemented natively: + + >>> df.with_columns( + ... scaled=(pl.col("val") * pl.col("val").count()).over("key"), + ... ).sort("key") # doctest: +IGNORE_RESULT + ''' + def flatten(self) -> Self: + ''' + Flatten a list or string column. + + Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b", "b"], + ... 
"values": [[1, 2], [2, 3], [4]], + ... } + ... ) + >>> df.group_by("group").agg(pl.col("values").flatten()) # doctest: +SKIP + shape: (2, 2) + ┌───────┬───────────┐ + │ group ┆ values │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═══════╪═══════════╡ + │ a ┆ [1, 2] │ + │ b ┆ [2, 3, 4] │ + └───────┴───────────┘ + ''' + def explode(self) -> Self: + ''' + Explode a list expression. + + This means that every item is expanded to a new row. + + Returns + ------- + Expr + Expression with the data type of the list elements. + + See Also + -------- + Expr.list.explode : Explode a list column. + Expr.str.explode : Explode a string column. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "group": ["a", "b"], + ... "values": [ + ... [1, 2], + ... [3, 4], + ... ], + ... } + ... ) + >>> df.select(pl.col("values").explode()) + shape: (4, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + └────────┘ + ''' + def implode(self) -> Self: + ''' + Aggregate values into a list. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [4, 5, 6], + ... } + ... ) + >>> df.select(pl.all().implode()) + shape: (1, 2) + ┌───────────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ list[i64] ┆ list[i64] │ + ╞═══════════╪═══════════╡ + │ [1, 2, 3] ┆ [4, 5, 6] │ + └───────────┴───────────┘ + ''' + def gather_every(self, n: int, offset: int = ...) -> Self: + ''' + Take every nth value in the Series and return as a new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").gather_every(3)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 4 │ + │ 7 │ + └─────┘ + + >>> df.select(pl.col("foo").gather_every(3, offset=1)) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 5 │ + │ 8 │ + └─────┘ + ''' + def head(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.head(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + ''' + def tail(self, n: int | Expr = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.tail(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + │ 6 │ + │ 7 │ + └─────┘ + ''' + def limit(self, n: int | Expr = ...) -> Self: + ''' + Get the first `n` rows (alias for :func:`Expr.head`). + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) + >>> df.limit(3) + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + ''' + def and_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "and" operator `expr & other & ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") >= pl.col("z")) + ... .and_( + ... pl.col("y") >= pl.col("z"), + ... 
pl.col("y") == pl.col("y"), + ... pl.col("z") <= pl.col("x"), + ... pl.col("y") != pl.col("x"), + ... ) + ... .alias("all") + ... ) + shape: (5, 1) + ┌───────┐ + │ all │ + │ --- │ + │ bool │ + ╞═══════╡ + │ true │ + │ true │ + │ true │ + │ false │ + │ false │ + └───────┘ + ''' + def or_(self, *others: Any) -> Self: + ''' + Method equivalent of bitwise "or" operator `expr | other | ...`. + + Parameters + ---------- + *others + One or more integer or boolean expressions to evaluate/combine. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5, 6, 7, 4, 8], + ... "y": [1.5, 2.5, 1.0, 4.0, -5.75], + ... "z": [-9, 2, -1, 4, 8], + ... } + ... ) + >>> df.select( + ... (pl.col("x") == pl.col("y")) + ... .or_( + ... pl.col("x") == pl.col("y"), + ... pl.col("y") == pl.col("z"), + ... pl.col("y").cast(int) == pl.col("z"), + ... ) + ... .alias("any") + ... ) + shape: (5, 1) + ┌───────┐ + │ any │ + │ --- │ + │ bool │ + ╞═══════╡ + │ false │ + │ true │ + │ false │ + │ true │ + │ false │ + └───────┘ + ''' + def eq(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x == y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x == y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ false │ + │ 2.0 ┆ 2.0 ┆ true │ + │ NaN ┆ NaN ┆ true │ + │ 4.0 ┆ 4.0 ┆ true │ + └─────┴─────┴────────┘ + ''' + def eq_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr == other` where `None == None`. + + This differs from default `eq` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").eq(pl.col("y")).alias("x eq y"), + ... pl.col("x").eq_missing(pl.col("y")).alias("x eq_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x eq y ┆ x eq_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ false ┆ false │ + │ 2.0 ┆ 2.0 ┆ true ┆ true │ + │ NaN ┆ NaN ┆ true ┆ true │ + │ 4.0 ┆ 4.0 ┆ true ┆ true │ + │ null ┆ 5.0 ┆ null ┆ false │ + │ null ┆ null ┆ null ┆ true │ + └──────┴──────┴────────┴────────────────┘ + ''' + def ge(self, other: Any) -> Self: + ''' + Method equivalent of "greater than or equal" operator `expr >= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ge(pl.col("y")).alias("x >= y"), + ... 
) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x >= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ true │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴────────┘ + ''' + def gt(self, other: Any) -> Self: + ''' + Method equivalent of "greater than" operator `expr > other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 2.0], + ... "y": [5.0, 3.0, float("nan"), 1.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").gt(pl.col("y")).alias("x > y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x > y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 5.0 ┆ 5.0 ┆ false │ + │ 4.0 ┆ 3.0 ┆ true │ + │ NaN ┆ NaN ┆ false │ + │ 2.0 ┆ 1.0 ┆ true │ + └─────┴─────┴───────┘ + ''' + def le(self, other: Any) -> Self: + ''' + Method equivalent of "less than or equal" operator `expr <= other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [5.0, 4.0, float("nan"), 0.5], + ... "y": [5.0, 3.5, float("nan"), 2.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").le(pl.col("y")).alias("x <= y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x <= y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 5.0 ┆ 5.0 ┆ true │ + │ 4.0 ┆ 3.5 ┆ false │ + │ NaN ┆ NaN ┆ true │ + │ 0.5 ┆ 2.0 ┆ true │ + └─────┴─────┴────────┘ + ''' + def lt(self, other: Any) -> Self: + ''' + Method equivalent of "less than" operator `expr < other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 3.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").lt(pl.col("y")).alias("x < y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬───────┐ + │ x ┆ y ┆ x < y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 3.0 ┆ 4.0 ┆ true │ + └─────┴─────┴───────┘ + ''' + def ne(self, other: Any) -> Self: + ''' + Method equivalent of inequality operator `expr != other`. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0], + ... "y": [2.0, 2.0, float("nan"), 4.0], + ... } + ... ) + >>> df.with_columns( + ... pl.col("x").ne(pl.col("y")).alias("x != y"), + ... ) + shape: (4, 3) + ┌─────┬─────┬────────┐ + │ x ┆ y ┆ x != y │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪═════╪════════╡ + │ 1.0 ┆ 2.0 ┆ true │ + │ 2.0 ┆ 2.0 ┆ false │ + │ NaN ┆ NaN ┆ false │ + │ 4.0 ┆ 4.0 ┆ false │ + └─────┴─────┴────────┘ + ''' + def ne_missing(self, other: Any) -> Self: + ''' + Method equivalent of equality operator `expr != other` where `None == None`. + + This differs from default `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + Examples + -------- + >>> df = pl.DataFrame( + ... data={ + ... "x": [1.0, 2.0, float("nan"), 4.0, None, None], + ... "y": [2.0, 2.0, float("nan"), 4.0, 5.0, None], + ... } + ... ) + >>> df.with_columns( + ... 
pl.col("x").ne(pl.col("y")).alias("x ne y"), + ... pl.col("x").ne_missing(pl.col("y")).alias("x ne_missing y"), + ... ) + shape: (6, 4) + ┌──────┬──────┬────────┬────────────────┐ + │ x ┆ y ┆ x ne y ┆ x ne_missing y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪════════╪════════════════╡ + │ 1.0 ┆ 2.0 ┆ true ┆ true │ + │ 2.0 ┆ 2.0 ┆ false ┆ false │ + │ NaN ┆ NaN ┆ false ┆ false │ + │ 4.0 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ 5.0 ┆ null ┆ true │ + │ null ┆ null ┆ null ┆ false │ + └──────┴──────┴────────┴────────────────┘ + ''' + def add(self, other: Any) -> Self: + ''' + Method equivalent of addition operator `expr + other`. + + Parameters + ---------- + other + numeric or string value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").add(2).alias("x+int"), + ... pl.col("x").add(pl.col("x").cum_prod()).alias("x+expr"), + ... ) + shape: (5, 3) + ┌─────┬───────┬────────┐ + │ x ┆ x+int ┆ x+expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═══════╪════════╡ + │ 1 ┆ 3 ┆ 2 │ + │ 2 ┆ 4 ┆ 4 │ + │ 3 ┆ 5 ┆ 9 │ + │ 4 ┆ 6 ┆ 28 │ + │ 5 ┆ 7 ┆ 125 │ + └─────┴───────┴────────┘ + + >>> df = pl.DataFrame( + ... {"x": ["a", "d", "g"], "y": ["b", "e", "h"], "z": ["c", "f", "i"]} + ... ) + >>> df.with_columns(pl.col("x").add(pl.col("y")).add(pl.col("z")).alias("xyz")) + shape: (3, 4) + ┌─────┬─────┬─────┬─────┐ + │ x ┆ y ┆ z ┆ xyz │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞═════╪═════╪═════╪═════╡ + │ a ┆ b ┆ c ┆ abc │ + │ d ┆ e ┆ f ┆ def │ + │ g ┆ h ┆ i ┆ ghi │ + └─────┴─────┴─────┴─────┘ + ''' + def floordiv(self, other: Any) -> Self: + ''' + Method equivalent of integer division operator `expr // other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + See Also + -------- + truediv + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 3, 4, 5]}) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").floordiv(2).alias("x//2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ x ┆ x/2 ┆ x//2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 0.5 ┆ 0 │ + │ 2 ┆ 1.0 ┆ 1 │ + │ 3 ┆ 1.5 ┆ 1 │ + │ 4 ┆ 2.0 ┆ 2 │ + │ 5 ┆ 2.5 ┆ 2 │ + └─────┴─────┴──────┘ + ''' + def mod(self, other: Any) -> Self: + ''' + Method equivalent of modulus operator `expr % other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns(pl.col("x").mod(2).alias("x%2")) + shape: (5, 2) + ┌─────┬─────┐ + │ x ┆ x%2 │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 0 │ + │ 1 ┆ 1 │ + │ 2 ┆ 0 │ + │ 3 ┆ 1 │ + │ 4 ┆ 0 │ + └─────┴─────┘ + ''' + def mul(self, other: Any) -> Self: + ''' + Method equivalent of multiplication operator `expr * other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8, 16]}) + >>> df.with_columns( + ... pl.col("x").mul(2).alias("x*2"), + ... pl.col("x").mul(pl.col("x").log(2)).alias("x * xlog2"), + ... ) + shape: (5, 3) + ┌─────┬─────┬───────────┐ + │ x ┆ x*2 ┆ x * xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═══════════╡ + │ 1 ┆ 2 ┆ 0.0 │ + │ 2 ┆ 4 ┆ 2.0 │ + │ 4 ┆ 8 ┆ 8.0 │ + │ 8 ┆ 16 ┆ 24.0 │ + │ 16 ┆ 32 ┆ 64.0 │ + └─────┴─────┴───────────┘ + ''' + def sub(self, other: Any) -> Self: + ''' + Method equivalent of subtraction operator `expr - other`. 
+ + Parameters + ---------- + other + Numeric literal or expression value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [0, 1, 2, 3, 4]}) + >>> df.with_columns( + ... pl.col("x").sub(2).alias("x-2"), + ... pl.col("x").sub(pl.col("x").cum_sum()).alias("x-expr"), + ... ) + shape: (5, 3) + ┌─────┬─────┬────────┐ + │ x ┆ x-2 ┆ x-expr │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪════════╡ + │ 0 ┆ -2 ┆ 0 │ + │ 1 ┆ -1 ┆ 0 │ + │ 2 ┆ 0 ┆ -1 │ + │ 3 ┆ 1 ┆ -3 │ + │ 4 ┆ 2 ┆ -6 │ + └─────┴─────┴────────┘ + ''' + def neg(self) -> Self: + ''' + Method equivalent of unary minus operator `-expr`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-1, 0, 2, None]}) + >>> df.with_columns(pl.col("a").neg()) + shape: (4, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 1 │ + │ 0 │ + │ -2 │ + │ null │ + └──────┘ + ''' + def truediv(self, other: Any) -> Self: + ''' + Method equivalent of float division operator `expr / other`. + + Parameters + ---------- + other + Numeric literal or expression value. + + Notes + ----- + Zero-division behaviour follows IEEE-754: + + 0/0: Invalid operation - mathematically undefined, returns NaN. + n/0: On finite operands gives an exact infinite result, eg: ±infinity. + + See Also + -------- + floordiv + + Examples + -------- + >>> df = pl.DataFrame( + ... data={"x": [-2, -1, 0, 1, 2], "y": [0.5, 0.0, 0.0, -4.0, -0.5]} + ... ) + >>> df.with_columns( + ... pl.col("x").truediv(2).alias("x/2"), + ... pl.col("x").truediv(pl.col("y")).alias("x/y"), + ... ) + shape: (5, 4) + ┌─────┬──────┬──────┬───────┐ + │ x ┆ y ┆ x/2 ┆ x/y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 ┆ f64 │ + ╞═════╪══════╪══════╪═══════╡ + │ -2 ┆ 0.5 ┆ -1.0 ┆ -4.0 │ + │ -1 ┆ 0.0 ┆ -0.5 ┆ -inf │ + │ 0 ┆ 0.0 ┆ 0.0 ┆ NaN │ + │ 1 ┆ -4.0 ┆ 0.5 ┆ -0.25 │ + │ 2 ┆ -0.5 ┆ 1.0 ┆ -4.0 │ + └─────┴──────┴──────┴───────┘ + ''' + def pow(self, exponent: IntoExprColumn | int | float) -> Self: + ''' + Method equivalent of exponentiation operator `expr ** exponent`. + + Parameters + ---------- + exponent + Numeric literal or expression exponent value. + + Examples + -------- + >>> df = pl.DataFrame({"x": [1, 2, 4, 8]}) + >>> df.with_columns( + ... pl.col("x").pow(3).alias("cube"), + ... pl.col("x").pow(pl.col("x").log(2)).alias("x ** xlog2"), + ... ) + shape: (4, 3) + ┌─────┬───────┬────────────┐ + │ x ┆ cube ┆ x ** xlog2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═══════╪════════════╡ + │ 1 ┆ 1.0 ┆ 1.0 │ + │ 2 ┆ 8.0 ┆ 2.0 │ + │ 4 ┆ 64.0 ┆ 16.0 │ + │ 8 ┆ 512.0 ┆ 512.0 │ + └─────┴───────┴────────────┘ + ''' + def xor(self, other: Any) -> Self: + ''' + Method equivalent of bitwise exclusive-or operator `expr ^ other`. + + Parameters + ---------- + other + Integer or boolean value; accepts expression input. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"x": [True, False, True, False], "y": [True, True, False, False]} + ... ) + >>> df.with_columns(pl.col("x").xor(pl.col("y")).alias("x ^ y")) + shape: (4, 3) + ┌───────┬───────┬───────┐ + │ x ┆ y ┆ x ^ y │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ false ┆ false ┆ false │ + └───────┴───────┴───────┘ + + >>> def binary_string(n: int) -> str: + ... return bin(n)[2:].zfill(8) + >>> + >>> df = pl.DataFrame( + ... data={"x": [10, 8, 250, 66], "y": [1, 2, 3, 4]}, + ... schema={"x": pl.UInt8, "y": pl.UInt8}, + ... ) + >>> df.with_columns( + ... pl.col("x").map_elements(binary_string).alias("bin_x"), + ... 
pl.col("y").map_elements(binary_string).alias("bin_y"), + ... pl.col("x").xor(pl.col("y")).alias("xor_xy"), + ... pl.col("x") + ... .xor(pl.col("y")) + ... .map_elements(binary_string) + ... .alias("bin_xor_xy"), + ... ) + shape: (4, 6) + ┌─────┬─────┬──────────┬──────────┬────────┬────────────┐ + │ x ┆ y ┆ bin_x ┆ bin_y ┆ xor_xy ┆ bin_xor_xy │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ str ┆ str ┆ u8 ┆ str │ + ╞═════╪═════╪══════════╪══════════╪════════╪════════════╡ + │ 10 ┆ 1 ┆ 00001010 ┆ 00000001 ┆ 11 ┆ 00001011 │ + │ 8 ┆ 2 ┆ 00001000 ┆ 00000010 ┆ 10 ┆ 00001010 │ + │ 250 ┆ 3 ┆ 11111010 ┆ 00000011 ┆ 249 ┆ 11111001 │ + │ 66 ┆ 4 ┆ 01000010 ┆ 00000100 ┆ 70 ┆ 01000110 │ + └─────┴─────┴──────────┴──────────┴────────┴────────────┘ + ''' + def is_in(self, other: Expr | Collection[Any] | Series) -> Self: + ''' + Check if elements of this expression are present in the other Series. + + Parameters + ---------- + other + Series or sequence of primitive type. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"sets": [[1, 2, 3], [1, 2], [9, 10]], "optional_members": [1, 2, 3]} + ... ) + >>> df.with_columns(contains=pl.col("optional_members").is_in("sets")) + shape: (3, 3) + ┌───────────┬──────────────────┬──────────┐ + │ sets ┆ optional_members ┆ contains │ + │ --- ┆ --- ┆ --- │ + │ list[i64] ┆ i64 ┆ bool │ + ╞═══════════╪══════════════════╪══════════╡ + │ [1, 2, 3] ┆ 1 ┆ true │ + │ [1, 2] ┆ 2 ┆ true │ + │ [9, 10] ┆ 3 ┆ false │ + └───────────┴──────────────────┴──────────┘ + ''' + def repeat_by(self, by: pl.Series | Expr | str | int) -> Self: + ''' + Repeat the elements in this Series as specified in the given expression. + + The repeated elements are expanded into a `List`. + + Parameters + ---------- + by + Numeric column that determines how often the values will be repeated. + The column will be coerced to UInt32. Give this dtype to make the coercion a + no-op. + + Returns + ------- + Expr + Expression of data type :class:`List`, where the inner data type is equal + to the original data type. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": ["x", "y", "z"], + ... "n": [1, 2, 3], + ... } + ... ) + >>> df.select(pl.col("a").repeat_by("n")) + shape: (3, 1) + ┌─────────────────┐ + │ a │ + │ --- │ + │ list[str] │ + ╞═════════════════╡ + │ ["x"] │ + │ ["y", "y"] │ + │ ["z", "z", "z"] │ + └─────────────────┘ + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Self: + ''' + Check if this expression is between the given lower and upper bounds. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Strings are parsed as column + names, other non-expression inputs are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"num": [1, 2, 3, 4, 5]}) + >>> df.with_columns(pl.col("num").is_between(2, 4).alias("is_between")) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ true │ + │ 5 ┆ false │ + └─────┴────────────┘ + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> df.with_columns( + ... pl.col("num").is_between(2, 4, closed="left").alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ num ┆ is_between │ + │ --- ┆ --- │ + │ i64 ┆ bool │ + ╞═════╪════════════╡ + │ 1 ┆ false │ + │ 2 ┆ true │ + │ 3 ┆ true │ + │ 4 ┆ false │ + │ 5 ┆ false │ + └─────┴────────────┘ + + You can also use strings as well as numeric/temporal values (note: ensure that + string literals are wrapped with `lit` so as not to conflate them with + column names): + + >>> df = pl.DataFrame({"a": ["a", "b", "c", "d", "e"]}) + >>> df.with_columns( + ... pl.col("a") + ... .is_between(pl.lit("a"), pl.lit("c"), closed="both") + ... .alias("is_between") + ... ) + shape: (5, 2) + ┌─────┬────────────┐ + │ a ┆ is_between │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞═════╪════════════╡ + │ a ┆ true │ + │ b ┆ true │ + │ c ┆ true │ + │ d ┆ false │ + │ e ┆ false │ + └─────┴────────────┘ + ''' + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Self: + ''' + Hash the elements in the selection. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 2, None], + ... "b": ["x", None, "z"], + ... } + ... ) + >>> df.with_columns(pl.all().hash(10, 20, 30, 40)) # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌──────────────────────┬──────────────────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u64 ┆ u64 │ + ╞══════════════════════╪══════════════════════╡ + │ 9774092659964970114 ┆ 13614470193936745724 │ + │ 1101441246220388612 ┆ 11638928888656214026 │ + │ 11638928888656214026 ┆ 13382926553367784577 │ + └──────────────────────┴──────────────────────┘ + ''' + def reinterpret(self) -> Self: + ''' + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + + Examples + -------- + >>> s = pl.Series("a", [1, 1, 2], dtype=pl.UInt64) + >>> df = pl.DataFrame([s]) + >>> df.select( + ... [ + ... pl.col("a").reinterpret(signed=True).alias("reinterpreted"), + ... pl.col("a").alias("original"), + ... ] + ... ) + shape: (3, 2) + ┌───────────────┬──────────┐ + │ reinterpreted ┆ original │ + │ --- ┆ --- │ + │ i64 ┆ u64 │ + ╞═══════════════╪══════════╡ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 2 ┆ 2 │ + └───────────────┴──────────┘ + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Print the value that this expression evaluates to and pass on the value. 
+ + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 1, 2]}) + >>> df.select(pl.col("foo").cum_sum().inspect("value is: {}").alias("bar")) + value is: shape: (3,) + Series: \'foo\' [i64] + [ + 1 + 2 + 4 + ] + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 4 │ + └─────┘ + ''' + def interpolate(self, method: InterpolationMethod = ...) -> Self: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. + + Examples + -------- + Fill null values using linear interpolation. + + >>> df = pl.DataFrame( + ... { + ... "a": [1, None, 3], + ... "b": [1.0, float("nan"), 3.0], + ... } + ... ) + >>> df.select(pl.all().interpolate()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ NaN │ + │ 3.0 ┆ 3.0 │ + └─────┴─────┘ + + Fill null values using nearest interpolation. + + >>> df.select(pl.all().interpolate("nearest")) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 1.0 │ + │ 3 ┆ NaN │ + │ 3 ┆ 3.0 │ + └─────┴─────┘ + + Regrid data to a new grid. + + >>> df_original_grid = pl.DataFrame( + ... { + ... "grid_points": [1, 3, 10], + ... "values": [2.0, 6.0, 20.0], + ... } + ... ) # Interpolate from this to the new grid + >>> df_new_grid = pl.DataFrame({"grid_points": range(1, 11)}) + >>> df_new_grid.join( + ... df_original_grid, on="grid_points", how="left" + ... ).with_columns(pl.col("values").interpolate()) + shape: (10, 2) + ┌─────────────┬────────┐ + │ grid_points ┆ values │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════════════╪════════╡ + │ 1 ┆ 2.0 │ + │ 2 ┆ 4.0 │ + │ 3 ┆ 6.0 │ + │ 4 ┆ 8.0 │ + │ 5 ┆ 10.0 │ + │ 6 ┆ 12.0 │ + │ 7 ┆ 14.0 │ + │ 8 ┆ 16.0 │ + │ 9 ┆ 18.0 │ + │ 10 ┆ 20.0 │ + └─────────────┴────────┘ + ''' + def rolling_min(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling min (moving min) over the values in this array. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). 
Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ 5.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.25 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.75 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ 1.25 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_min=pl.col("A").rolling_min(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_min │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 4.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("index").rolling_min( + ... 
window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 18 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └───────┴─────────────────────┴─────────────────┘ + ''' + def rolling_max(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling max (moving max) over the values in this array. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). 
+ + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ 6.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.25 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ 3.75 │ + │ 6.0 ┆ 4.5 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_max=pl.col("A").rolling_max(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_max │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 4.0 │ + │ 4.0 ┆ 5.0 │ + │ 5.0 ┆ 6.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling max with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("index").rolling_max( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 19 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("index").rolling_max( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 4 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 20 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └───────┴─────────────────────┴─────────────────┘ + ''' + def rolling_mean(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). 
+ + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴──────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴──────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_mean=pl.col("A").rolling_mean(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬──────────────┐ + │ A ┆ rolling_mean │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴──────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling mean with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("index").rolling_mean( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬──────────────────┐ + │ index ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 2.5 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 18.5 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └───────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("index").rolling_mean( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬──────────────────┐ + │ index ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 3.0 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 19.0 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └───────┴─────────────────────┴──────────────────┘ + ''' + def rolling_sum(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + of dtype `{Date, Datetime}` + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
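# Editor's sketch, not part of the generated stub: the `rolling` context
# mentioned in the Notes above lets several statistics share one dynamic
# window computation (reuses the `df_temporal` frame built in the Examples below).
df_temporal.rolling(index_column="date", period="2h", closed="left").agg(
    pl.col("index").min().alias("rolling_row_min"),
    pl.col("index").max().alias("rolling_row_max"),
    pl.col("index").sum().alias("rolling_row_sum"),
)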
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 3.0 │ + │ 3.0 ┆ 5.0 │ + │ 4.0 ┆ 7.0 │ + │ 5.0 ┆ 9.0 │ + │ 6.0 ┆ 11.0 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.75 │ + │ 3.0 ┆ 2.75 │ + │ 4.0 ┆ 3.75 │ + │ 5.0 ┆ 4.75 │ + │ 6.0 ┆ 5.75 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_sum=pl.col("A").rolling_sum(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_sum │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 6.0 │ + │ 3.0 ┆ 9.0 │ + │ 4.0 ┆ 12.0 │ + │ 5.0 ┆ 15.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling sum with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("index").rolling_sum( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 5 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 37 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("index").rolling_sum( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 9 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 57 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └───────┴─────────────────────┴─────────────────┘ + ''' + def rolling_std(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling standard deviation. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.707107 │ + │ 3.0 ┆ 0.707107 │ + │ 4.0 ┆ 0.707107 │ + │ 5.0 ┆ 0.707107 │ + │ 6.0 ┆ 0.707107 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.433013 │ + │ 3.0 ┆ 0.433013 │ + │ 4.0 ┆ 0.433013 │ + │ 5.0 ┆ 0.433013 │ + │ 6.0 ┆ 0.433013 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_std=pl.col("A").rolling_std(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_std │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling std with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("index").rolling_std( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 0.707107 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("index").rolling_std( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 1.0 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └───────┴─────────────────────┴─────────────────┘ + ''' + def rolling_var(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling variance. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.5 │ + │ 3.0 ┆ 0.5 │ + │ 4.0 ┆ 0.5 │ + │ 5.0 ┆ 0.5 │ + │ 6.0 ┆ 0.5 │ + └─────┴─────────────┘ + + Specify weights to multiply the values in the window with: + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 0.1875 │ + │ 3.0 ┆ 0.1875 │ + │ 4.0 ┆ 0.1875 │ + │ 5.0 ┆ 0.1875 │ + │ 6.0 ┆ 0.1875 │ + └─────┴─────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_var=pl.col("A").rolling_var(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬─────────────┐ + │ A ┆ rolling_var │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.0 │ + │ 3.0 ┆ 1.0 │ + │ 4.0 ┆ 1.0 │ + │ 5.0 ┆ 1.0 │ + │ 6.0 ┆ null │ + └─────┴─────────────┘ + + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling var with the default left closure of temporal windows + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("index").rolling_var( + ... window_size="2h", by="date", closed="left" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 0.5 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("index").rolling_var( + ... window_size="2h", by="date", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 1.0 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └───────┴─────────────────────┴─────────────────┘ + ''' + def rolling_median(self, window_size: int | timedelta | str, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling median. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + window_size + The length of the window. Can be a fixed integer size, or a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... 
rolling_median=pl.col("A").rolling_median(window_size=2), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median( + ... window_size=2, weights=[0.25, 0.75] + ... ), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 1.5 │ + │ 3.0 ┆ 2.5 │ + │ 4.0 ┆ 3.5 │ + │ 5.0 ┆ 4.5 │ + │ 6.0 ┆ 5.5 │ + └─────┴────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_median=pl.col("A").rolling_median(window_size=3, center=True), + ... ) + shape: (6, 2) + ┌─────┬────────────────┐ + │ A ┆ rolling_median │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ 2.0 │ + │ 3.0 ┆ 3.0 │ + │ 4.0 ┆ 4.0 │ + │ 5.0 ┆ 5.0 │ + │ 6.0 ┆ null │ + └─────┴────────────────┘ + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int | timedelta | str = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a rolling quantile. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + If `by` has not been specified (the default), the window at a given row will + include the row itself, and the `window_size - 1` elements before it. + + If you pass a `by` column ``, then `closed="left"` + means the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + With `closed="right"`, the left endpoint is not included and the right + endpoint is included. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. Can be a fixed integer size, or a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + If a timedelta or the dynamic string language is used, the `by` + and `closed` arguments must also be set. + weights + An optional slice with the same length as the window that determines the + relative contribution of each value in a window to the output. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + by + If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must + set the column that will be used to determine the windows. 
This column must + be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive); only + applicable if `by` has been set. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column (if passed). + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + >>> df = pl.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]}) + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4 + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights for the values in each window: + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, window_size=4, weights=[0.2, 0.4, 0.4, 0.2] + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 2.0 │ + │ 5.0 ┆ 3.0 │ + │ 6.0 ┆ 4.0 │ + └─────┴──────────────────┘ + + Specify weights and interpolation method + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.25, + ... window_size=4, + ... weights=[0.2, 0.4, 0.4, 0.2], + ... interpolation="linear", + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ null │ + │ 4.0 ┆ 1.625 │ + │ 5.0 ┆ 2.625 │ + │ 6.0 ┆ 3.625 │ + └─────┴──────────────────┘ + + Center the values in the window + + >>> df.with_columns( + ... rolling_quantile=pl.col("A").rolling_quantile( + ... quantile=0.2, window_size=5, center=True + ... ), + ... ) + shape: (6, 2) + ┌─────┬──────────────────┐ + │ A ┆ rolling_quantile │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════════════════╡ + │ 1.0 ┆ null │ + │ 2.0 ┆ null │ + │ 3.0 ┆ 2.0 │ + │ 4.0 ┆ 3.0 │ + │ 5.0 ┆ null │ + │ 6.0 ┆ null │ + └─────┴──────────────────┘ + ''' + def rolling_skew(self, window_size: int) -> Self: + ''' + Compute a rolling skew. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 4, 2, 9]}) + >>> df.select(pl.col("a").rolling_skew(3)) + shape: (4, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ null │ + │ null │ + │ 0.381802 │ + │ 0.47033 │ + └──────────┘ + + Note how the values match the following: + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + ''' + Compute a custom rolling window function. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window. + + Warnings + -------- + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Expr.rolling_sum` if at all possible. + + Examples + -------- + >>> from numpy import nansum + >>> df = pl.DataFrame({"a": [11.0, 2.0, 9.0, float("nan"), 8.0]}) + >>> df.select(pl.col("a").rolling_map(nansum, window_size=3)) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ null │ + │ null │ + │ 22.0 │ + │ 11.0 │ + │ 17.0 │ + └──────┘ + ''' + def abs(self) -> Self: + ''' + Compute absolute values. + + Same as `abs(expr)`. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "A": [-1.0, 0.0, 1.0, 2.0], + ... } + ... ) + >>> df.select(pl.col("A").abs()) + shape: (4, 1) + ┌─────┐ + │ A │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + │ 0.0 │ + │ 1.0 │ + │ 2.0 │ + └─────┘ + ''' + def rank(self, method: RankMethod = ...) -> Self: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. 
+ + Examples + -------- + The \'average\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank()) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 3.0 │ + │ 4.5 │ + │ 1.5 │ + │ 1.5 │ + │ 4.5 │ + └─────┘ + + The \'ordinal\' method: + + >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]}) + >>> df.select(pl.col("a").rank("ordinal")) + shape: (5, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 3 │ + │ 4 │ + │ 1 │ + │ 2 │ + │ 5 │ + └─────┘ + + Use \'rank\' with \'over\' to rank within groups: + + >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]}) + >>> df.with_columns(pl.col("b").rank().over("a").alias("rank")) + shape: (5, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ rank │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ f64 │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ 1.0 │ + │ 1 ┆ 7 ┆ 2.0 │ + │ 2 ┆ 5 ┆ 1.0 │ + │ 2 ┆ 14 ┆ 3.0 │ + │ 2 ┆ 11 ┆ 2.0 │ + └─────┴─────┴──────┘ + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Self: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> df = pl.DataFrame({"int": [20, 10, 30, 25, 35]}) + >>> df.with_columns(change=pl.col("int").diff()) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ -10 │ + │ 30 ┆ 20 │ + │ 25 ┆ -5 │ + │ 35 ┆ 10 │ + └─────┴────────┘ + + >>> df.with_columns(change=pl.col("int").diff(n=2)) + shape: (5, 2) + ┌─────┬────────┐ + │ int ┆ change │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪════════╡ + │ 20 ┆ null │ + │ 10 ┆ null │ + │ 30 ┆ 10 │ + │ 25 ┆ 15 │ + │ 35 ┆ 5 │ + └─────┴────────┘ + + >>> df.select(pl.col("int").diff(n=2, null_behavior="drop").alias("diff")) + shape: (3, 1) + ┌──────┐ + │ diff │ + │ --- │ + │ i64 │ + ╞══════╡ + │ 10 │ + │ 15 │ + │ 5 │ + └──────┘ + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [10, 11, 12, None, 12], + ... } + ... ) + >>> df.with_columns(pl.col("a").pct_change().alias("pct_change")) + shape: (5, 2) + ┌──────┬────────────┐ + │ a ┆ pct_change │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞══════╪════════════╡ + │ 10 ┆ null │ + │ 11 ┆ 0.1 │ + │ 12 ┆ 0.090909 │ + │ null ┆ 0.0 │ + │ 12 ┆ 0.0 │ + └──────┴────────────┘ + ''' + def skew(self) -> Self: + ''' + Compute the sample skewness of a data set. + + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. 
math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").skew()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.343622 │ + └──────────┘ + ''' + def kurtosis(self) -> Self: + ''' + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher\'s definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators. + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher\'s definition is used (normal ==> 0.0). If False, + Pearson\'s definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").kurtosis()) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.153061 │ + └───────────┘ + ''' + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Self: + ''' + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> df = pl.DataFrame({"a": [-50, 5, 50, None]}) + >>> df.with_columns(clip=pl.col("a").clip(1, 10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ 1 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + + Specifying only a single bound: + + >>> df.with_columns(clip=pl.col("a").clip(upper_bound=10)) + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ clip │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ -50 ┆ -50 │ + │ 5 ┆ 5 │ + │ 50 ┆ 10 │ + │ null ┆ null │ + └──────┴──────┘ + ''' + def lower_bound(self) -> Self: + ''' + Calculate the lower bound. + + Returns a unit Series with the lowest value possible for the dtype of this + expression. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").lower_bound()) + shape: (1, 1) + ┌──────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════════════════════╡ + │ -9223372036854775808 │ + └──────────────────────┘ + ''' + def upper_bound(self) -> Self: + ''' + Calculate the upper bound. + + Returns a unit Series with the highest value possible for the dtype of this + expression. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3, 2, 1]}) + >>> df.select(pl.col("a").upper_bound()) + shape: (1, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════════════════════╡ + │ 9223372036854775807 │ + └─────────────────────┘ + ''' + def sign(self) -> Self: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> df = pl.DataFrame({"a": [-9.0, -0.0, 0.0, 4.0, None]}) + >>> df.select(pl.col("a").sign()) + shape: (5, 1) + ┌──────┐ + │ a │ + │ --- │ + │ i64 │ + ╞══════╡ + │ -1 │ + │ 0 │ + │ 0 │ + │ 1 │ + │ null │ + └──────┘ + ''' + def sin(self) -> Self: + ''' + Compute the element-wise value for the sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").sin()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + ''' + def cos(self) -> Self: + ''' + Compute the element-wise value for the cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").cos()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 1.0 │ + └─────┘ + ''' + def tan(self) -> Self: + ''' + Compute the element-wise value for the tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tan().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 1.56 │ + └──────┘ + ''' + def cot(self) -> Self: + ''' + Compute the element-wise value for the cotangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cot().round(2)) + shape: (1, 1) + ┌──────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════╡ + │ 0.64 │ + └──────┘ + ''' + def arcsin(self) -> Self: + ''' + Compute the element-wise value for the inverse sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsin()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + ''' + def arccos(self) -> Self: + ''' + Compute the element-wise value for the inverse cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [0.0]}) + >>> df.select(pl.col("a").arccos()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.570796 │ + └──────────┘ + ''' + def arctan(self) -> Self: + ''' + Compute the element-wise value for the inverse tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctan()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.785398 │ + └──────────┘ + ''' + def sinh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. 
+ + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").sinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.175201 │ + └──────────┘ + ''' + def cosh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").cosh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.543081 │ + └──────────┘ + ''' + def tanh(self) -> Self: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").tanh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.761594 │ + └──────────┘ + ''' + def arcsinh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arcsinh()) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.881374 │ + └──────────┘ + ''' + def arccosh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arccosh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 0.0 │ + └─────┘ + ''' + def arctanh(self) -> Self: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1.0]}) + >>> df.select(pl.col("a").arctanh()) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ inf │ + └─────┘ + ''' + def degrees(self) -> Self: + ''' + Convert from radians to degrees. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> import math + >>> df = pl.DataFrame({"a": [x * math.pi for x in range(-4, 5)]}) + >>> df.select(pl.col("a").degrees()) + shape: (9, 1) + ┌────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════╡ + │ -720.0 │ + │ -540.0 │ + │ -360.0 │ + │ -180.0 │ + │ 0.0 │ + │ 180.0 │ + │ 360.0 │ + │ 540.0 │ + │ 720.0 │ + └────────┘ + ''' + def radians(self) -> Self: + ''' + Convert from degrees to radians. + + Returns + ------- + Expr + Expression of data type :class:`Float64`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [-720, -540, -360, -180, 0, 180, 360, 540, 720]}) + >>> df.select(pl.col("a").radians()) + shape: (9, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞════════════╡ + │ -12.566371 │ + │ -9.424778 │ + │ -6.283185 │ + │ -3.141593 │ + │ 0.0 │ + │ 3.141593 │ + │ 6.283185 │ + │ 9.424778 │ + │ 12.566371 │ + └────────────┘ + ''' + def reshape(self, dimensions: tuple[int, ...]) -> Self: + ''' + Reshape this Expr to a flat Series or a Series of Lists. + + Parameters + ---------- + dimensions + Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that + dimension is inferred. + + Returns + ------- + Expr + If a single dimension is given, results in an expression of the original + data type. 
+ If a multiple dimensions are given, results in an expression of data type + :class:`List` with shape (rows, cols). + + Examples + -------- + >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + >>> df.select(pl.col("foo").reshape((3, 3))) + shape: (3, 1) + ┌───────────┐ + │ foo │ + │ --- │ + │ list[i64] │ + ╞═══════════╡ + │ [1, 2, 3] │ + │ [4, 5, 6] │ + │ [7, 8, 9] │ + └───────────┘ + + See Also + -------- + Expr.list.explode : Explode a list column. + ''' + def shuffle(self, seed: int | None = ...) -> Self: + ''' + Shuffle the contents of this expression. + + Parameters + ---------- + seed + Seed for the random number generator. If set to None (default), a + random seed is generated each time the shuffle is called. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").shuffle(seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 3 │ + └─────┘ + ''' + def sample(self, n: int | IntoExprColumn | None = ...) -> Self: + ''' + Sample from this expression. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").sample(fraction=1.0, with_replacement=True, seed=1)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 1 │ + │ 1 │ + └─────┘ + ''' + def ewm_mean(self) -> Self: + ''' + Exponentially-weighted moving average. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. 
For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_mean(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.0 │ + │ 1.666667 │ + │ 2.428571 │ + └──────────┘ + ''' + def ewm_std(self) -> Self: + ''' + Exponentially-weighted moving standard deviation. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_std(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.707107 │ + │ 0.963624 │ + └──────────┘ + ''' + def ewm_var(self) -> Self: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. 
+ adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").ewm_var(com=1)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 0.5 │ + │ 0.928571 │ + └──────────┘ + ''' + def extend_constant(self, value: PythonLiteral | None, n: int) -> Self: + ''' + Extremely fast method for extending the Series with \'n\' copies of a value. + + Parameters + ---------- + value + A constant literal value (not an expression) with which to extend the + expression result Series; can pass None to extend with nulls. + n + The number of additional values that will be added. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select((pl.col("values") - 1).extend_constant(99, n=2)) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 0 │ + │ 1 │ + │ 2 │ + │ 99 │ + │ 99 │ + └────────┘ + ''' + def value_counts(self) -> Self: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + Expr + Expression of data type :class:`Struct` with mapping of unique values to + their count. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"color": ["red", "blue", "red", "green", "blue", "blue"]} + ... ) + >>> df.select(pl.col("color").value_counts()) # doctest: +IGNORE_RESULT + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"red",2} │ + │ {"green",1} │ + │ {"blue",3} │ + └─────────────┘ + + Sort the output by count. + + >>> df.select(pl.col("color").value_counts(sort=True)) + shape: (3, 1) + ┌─────────────┐ + │ color │ + │ --- │ + │ struct[2] │ + ╞═════════════╡ + │ {"blue",3} │ + │ {"red",2} │ + │ {"green",1} │ + └─────────────┘ + ''' + def unique_counts(self) -> Self: + ''' + Return a count of the unique values in the order of appearance. 
+ + This method differs from `value_counts` in that it does not return the + values, only the counts and might be faster + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "id": ["a", "b", "b", "c", "c", "c"], + ... } + ... ) + >>> df.select( + ... [ + ... pl.col("id").unique_counts(), + ... ] + ... ) + shape: (3, 1) + ┌─────┐ + │ id │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + ''' + def log(self, base: float = ...) -> Self: + ''' + Compute the logarithm to a given base. + + Parameters + ---------- + base + Given base, defaults to `e` + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log(base=2)) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.0 │ + │ 1.0 │ + │ 1.584963 │ + └──────────┘ + ''' + def log1p(self) -> Self: + ''' + Compute the natural logarithm of each element plus one. + + This computes `log(1 + x)` but is more numerically stable for `x` close to zero. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").log1p()) + shape: (3, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 0.693147 │ + │ 1.098612 │ + │ 1.386294 │ + └──────────┘ + ''' + def entropy(self, base: float = ...) -> Self: + ''' + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn\'t sum to 1. + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 2, 3]}) + >>> df.select(pl.col("a").entropy(base=2)) + shape: (1, 1) + ┌──────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞══════════╡ + │ 1.459148 │ + └──────────┘ + >>> df.select(pl.col("a").entropy(base=2, normalize=False)) + shape: (1, 1) + ┌───────────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -6.754888 │ + └───────────┘ + ''' + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Self: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3, 4, 5]}) + >>> df.select( + ... [ + ... pl.col("values").cumulative_eval( + ... pl.element().first() - pl.element().last() ** 2 + ... ) + ... ] + ... ) + shape: (5, 1) + ┌────────┐ + │ values │ + │ --- │ + │ f64 │ + ╞════════╡ + │ 0.0 │ + │ -3.0 │ + │ -8.0 │ + │ -15.0 │ + │ -24.0 │ + └────────┘ + ''' + def set_sorted(self) -> Self: + ''' + Flags the expression as \'sorted\'. + + Enables downstream code to user fast paths for sorted arrays. + + Parameters + ---------- + descending + Whether the `Series` order is descending. + + Warnings + -------- + This can lead to incorrect results if this `Series` is not sorted!! + Use with care! 
+ + Examples + -------- + >>> df = pl.DataFrame({"values": [1, 2, 3]}) + >>> df.select(pl.col("values").set_sorted().max()) + shape: (1, 1) + ┌────────┐ + │ values │ + │ --- │ + │ i64 │ + ╞════════╡ + │ 3 │ + └────────┘ + ''' + def shrink_dtype(self) -> Self: + ''' + Shrink numeric columns to the minimal required datatype. + + Shrink to the dtype needed to fit the extrema of this [`Series`]. + This can be used to reduce memory pressure. + + Examples + -------- + >>> pl.DataFrame( + ... { + ... "a": [1, 2, 3], + ... "b": [1, 2, 2 << 32], + ... "c": [-1, 2, 1 << 30], + ... "d": [-112, 2, 112], + ... "e": [-112, 2, 129], + ... "f": ["a", "b", "c"], + ... "g": [0.1, 1.32, 0.12], + ... "h": [True, None, False], + ... } + ... ).select(pl.all().shrink_dtype()) + shape: (3, 8) + ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │ + ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡ + │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │ + │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │ + │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │ + └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘ + ''' + def hist(self, bins: IntoExpr | None = ...) -> Self: + ''' + Bin values into buckets and count their occurrences. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + include_breakpoint + Include a column that indicates the upper breakpoint. + include_category + Include a column that shows the intervals as categories. + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = pl.DataFrame({"a": [1, 3, 8, 8, 2, 1, 3]}) + >>> df.select(pl.col("a").hist(bins=[1, 2, 3])) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + │ 1 │ + │ 2 │ + │ 2 │ + └─────┘ + >>> df.select( + ... pl.col("a").hist( + ... bins=[1, 2, 3], include_breakpoint=True, include_category=True + ... ) + ... ) + shape: (4, 1) + ┌───────────────────────┐ + │ a │ + │ --- │ + │ struct[3] │ + ╞═══════════════════════╡ + │ {1.0,"(-inf, 1.0]",2} │ + │ {2.0,"(1.0, 2.0]",1} │ + │ {3.0,"(2.0, 3.0]",2} │ + │ {inf,"(3.0, inf]",2} │ + └───────────────────────┘ + ''' + def replace(self, old: IntoExpr | Sequence[Any] | Mapping[Any, Any], new: IntoExpr | Sequence[Any] | NoDefault = ...) -> Self: + ''' + Replace values by different values. + + Parameters + ---------- + old + Value or sequence of values to replace. + Accepts expression input. Sequences are parsed as Series, + other non-expression inputs are parsed as literals. + Also accepts a mapping of values to their replacement as syntactic sugar for + `replace(new=Series(mapping.keys()), old=Series(mapping.values()))`. + new + Value or sequence of values to replace by. + Accepts expression input. Sequences are parsed as Series, + other non-expression inputs are parsed as literals. + Length must match the length of `old` or have length 1. + default + Set values that were not replaced to this value. + Defaults to keeping the original value. + Accepts expression input. Non-expression inputs are parsed as literals. + return_dtype + The data type of the resulting expression. 
If set to `None` (default), + the data type is determined automatically based on the other inputs. + + See Also + -------- + str.replace + + Notes + ----- + The global string cache must be enabled when replacing categorical values. + + Examples + -------- + Replace a single value by another value. Values that were not replaced remain + unchanged. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3]}) + >>> df.with_columns(replaced=pl.col("a").replace(2, 100)) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 3 │ + └─────┴──────────┘ + + Replace multiple values by passing sequences to the `old` and `new` parameters. + + >>> df.with_columns(replaced=pl.col("a").replace([2, 3], [100, 200])) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 200 │ + └─────┴──────────┘ + + Passing a mapping with replacements is also supported as syntactic sugar. + Specify a default to set all values that were not matched. + + >>> mapping = {2: 100, 3: 200} + >>> df.with_columns(replaced=pl.col("a").replace(mapping, default=-1)) + shape: (4, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════════╡ + │ 1 ┆ -1 │ + │ 2 ┆ 100 │ + │ 2 ┆ 100 │ + │ 3 ┆ 200 │ + └─────┴──────────┘ + + Replacing by values of a different data type sets the return type based on + a combination of the `new` data type and either the original data type or the + default data type if it was set. + + >>> df = pl.DataFrame({"a": ["x", "y", "z"]}) + >>> mapping = {"x": 1, "y": 2, "z": 3} + >>> df.with_columns(replaced=pl.col("a").replace(mapping)) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + >>> df.with_columns(replaced=pl.col("a").replace(mapping, default=None)) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + + Set the `return_dtype` parameter to control the resulting data type directly. + + >>> df.with_columns( + ... replaced=pl.col("a").replace(mapping, return_dtype=pl.UInt8) + ... ) + shape: (3, 2) + ┌─────┬──────────┐ + │ a ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ u8 │ + ╞═════╪══════════╡ + │ x ┆ 1 │ + │ y ┆ 2 │ + │ z ┆ 3 │ + └─────┴──────────┘ + + Expression input is supported for all parameters. + + >>> df = pl.DataFrame({"a": [1, 2, 2, 3], "b": [1.5, 2.5, 5.0, 1.0]}) + >>> df.with_columns( + ... replaced=pl.col("a").replace( + ... old=pl.col("a").max(), + ... new=pl.col("b").sum(), + ... default=pl.col("b"), + ... ) + ... ) + shape: (4, 3) + ┌─────┬─────┬──────────┐ + │ a ┆ b ┆ replaced │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪═════╪══════════╡ + │ 1 ┆ 1.5 ┆ 1.5 │ + │ 2 ┆ 2.5 ┆ 2.5 │ + │ 2 ┆ 5.0 ┆ 5.0 │ + │ 3 ┆ 1.0 ┆ 10.0 │ + └─────┴─────┴──────────┘ + ''' + def map(self, function: Callable[[Series], Series | Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom python function to a Series or sequence of Series. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. 
+ agg_list + Aggregate list + """ + def apply(self, function: Callable[[Series], Series] | Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + """ + Apply a custom/user-defined function (UDF) in a GroupBy or Projection context. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.map_elements`. + + Parameters + ---------- + function + Lambda/ function to apply. + return_dtype + Dtype of the output Series. + If not set, the dtype will be + `polars.Unknown`. + skip_nulls + Don't apply the function over values + that contain nulls. This is faster. + pass_name + Pass the Series name to the custom function + This is more expensive. + strategy : {'thread_local', 'threading'} + This functionality is in `alpha` stage. This may be removed + /changed without it being considered a breaking change. + + - 'thread_local': run the python function on a single thread. + - 'threading': run the python function on separate threads. Use with + care as this can slow performance. This might only speed up + your code if the amount of work per element is significant + and the python function releases the GIL (e.g. via calling + a c function) + """ + def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Self: + """ + Apply a custom rolling window function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`Expr.rolling_map`. + + Parameters + ---------- + function + Aggregation function + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + """ + def is_first(self) -> Self: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_first_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + """ + def is_last(self) -> Self: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Expr.is_last_distinct`. + + Returns + ------- + Expr + Expression of data type :class:`Boolean`. + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Self: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + """ + def shift_and_fill(self, fill_value: IntoExpr) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). 
+ """ + def register_plugin(self) -> Self: + """ + Register a shared library as a plugin. + + .. warning:: + This is highly unsafe as this will call the C function + loaded by `lib::symbol`. + + The parameters you give dictate how polars will deal + with the function. Make sure they are correct! + + .. note:: + This functionality is unstable and may change without it + being considered breaking. + + Parameters + ---------- + lib + Library to load. + symbol + Function to load. + args + Arguments (other than self) passed to this function. + These arguments have to be of type Expression. + kwargs + Non-expression arguments. They must be JSON serializable. + is_elementwise + If the function only operates on scalars + this will trigger fast paths. + input_wildcard_expansion + Expand expressions as input of this function. + returns_scalar + Automatically explode on unit length if it ran as final aggregation. + this is the case for aggregations like `sum`, `min`, `covariance` etc. + cast_to_supertypes + Cast the input datatypes to their supertype. + pass_name_to_apply + if set, then the `Series` passed to the function in the group_by operation + will ensure the name is set. This is an extra heap allocation per group. + changes_length + For example a `unique` or a `slice` + """ + def _register_plugin(self) -> Self: ... + def take_every(self, n: int, offset: int = ...) -> Self: + """ + Take every nth value in the Series and return as a new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Self: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + An expression that leads to a UInt32 dtyped Series. + """ + def cumsum(self) -> Self: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumprod(self) -> Self: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummin(self) -> Self: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cummax(self) -> Self: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def cumcount(self) -> Self: + """ + Get an array with the cumulative count computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_count`. + + Parameters + ---------- + reverse + Reverse the operation. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in column according to remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. 
+ + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Accepts expression input. Non-expression inputs are parsed as literals. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def meta(self): ... + @property + def name(self): ... + @property + def str(self): ... + @property + def struct(self): ... +def _prepare_alpha(com: float | int | None = ..., span: float | int | None = ..., half_life: float | int | None = ..., alpha: float | int | None = ...) -> float: + """Normalise EWM decay specification in terms of smoothing factor 'alpha'.""" +def _prepare_rolling_window_args(window_size: int | timedelta | str, min_periods: int | None = ...) -> tuple[str, int]: ... diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.6/polars/lazyframe/frame.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.6/polars/lazyframe/frame.pyi new file mode 100644 index 0000000..a0d0746 --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.6/polars/lazyframe/frame.pyi @@ -0,0 +1,4354 @@ +#: version 0.20.6 +import P +import np +import pa +from builtins import PyLazyFrame +from pathlib import Path +from polars.convert import from_dict as from_dict +from polars.datatypes.classes import Boolean as Boolean, Categorical as Categorical, DataTypeGroup as DataTypeGroup, Date as Date, Datetime as Datetime, Duration as Duration, Enum as Enum, Float32 as Float32, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, Null as Null, Object as Object, String as String, Time as Time, UInt16 as UInt16, UInt32 as UInt32, UInt64 as UInt64, UInt8 as UInt8, Unknown as Unknown +from polars.datatypes.convert import is_polars_dtype as is_polars_dtype, py_type_to_dtype as py_type_to_dtype +from polars.dependencies import dataframe_api_compat as dataframe_api_compat +from polars.io._utils import _is_local_file as _is_local_file, _is_supported_cloud as _is_supported_cloud +from polars.io.csv._utils import _check_arg_is_1byte as _check_arg_is_1byte +from polars.io.ipc.anonymous_scan import _scan_ipc_fsspec as _scan_ipc_fsspec +from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec as _scan_parquet_fsspec +from polars.lazyframe.group_by import LazyGroupBy as LazyGroupBy +from polars.lazyframe.in_process import InProcessQuery as InProcessQuery +from polars.selectors import _expand_selectors as _expand_selectors, by_dtype as by_dtype, expand_selector as expand_selector +from polars.slice import LazyPolarsSlice as LazyPolarsSlice +from polars.utils._async import _AioDataFrameResult as _AioDataFrameResult, _GeventDataFrameResult as _GeventDataFrameResult +from polars.utils._parse_expr_input import parse_as_expression as parse_as_expression, parse_as_list_of_expressions as parse_as_list_of_expressions +from polars.utils._wrap import wrap_df as wrap_df, wrap_expr as wrap_expr +from polars.utils.convert import _negate_duration as _negate_duration, _timedelta_to_pl_duration as _timedelta_to_pl_duration +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_parameter_as_positional as deprecate_parameter_as_positional, deprecate_renamed_function as 
deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, deprecate_saturating as deprecate_saturating, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.unstable import issue_unstable_warning as issue_unstable_warning, unstable as unstable +from polars.utils.various import _in_notebook as _in_notebook, _prepare_row_index_args as _prepare_row_index_args, _process_null_values as _process_null_values, is_bool_sequence as is_bool_sequence, is_sequence as is_sequence, normalize_filepath as normalize_filepath, parse_percentiles as parse_percentiles +from typing import Any, Callable, ClassVar as _ClassVar, Collection, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +DTYPE_TEMPORAL_UNITS: frozenset +N_INFER_DEFAULT: int + +class LazyFrame: + _accessors: _ClassVar[set] = ... + def __init__(self, data: FrameInitTypes | None = ..., schema: SchemaDefinition | None = ...) -> None: ... + @classmethod + def _from_pyldf(cls, ldf: PyLazyFrame) -> Self: ... + @classmethod + def _scan_csv(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a CSV file or multiple files via glob patterns. + + Use `pl.scan_csv` to dispatch to this method. + + See Also + -------- + polars.io.scan_csv + """ + @classmethod + def _scan_parquet(cls, source: str | list[str] | list[Path]) -> Self: + """ + Lazily read from a parquet file or multiple files via glob patterns. + + Use `pl.scan_parquet` to dispatch to this method. + + See Also + -------- + polars.io.scan_parquet + """ + @classmethod + def _scan_ipc(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from an Arrow IPC (Feather v2) file. + + Use `pl.scan_ipc` to dispatch to this method. + + See Also + -------- + polars.io.scan_ipc + """ + @classmethod + def _scan_ndjson(cls, source: str | Path | list[str] | list[Path]) -> Self: + """ + Lazily read from a newline delimited JSON file. + + Use `pl.scan_ndjson` to dispatch to this method. + + See Also + -------- + polars.io.scan_ndjson + """ + @classmethod + def _scan_python_function(cls, schema: pa.schema | Mapping[str, PolarsDataType], scan_fn: Any) -> Self: ... + @classmethod + def deserialize(cls, source: str | Path | IOBase) -> Self: + ''' + Read a logical plan from a JSON file to construct a LazyFrame. + + Parameters + ---------- + source + Path to a file or a file-like object (by file-like object, we refer to + objects that have a `read()` method, such as a file handler (e.g. + via builtin `open` function) or `BytesIO`). + + See Also + -------- + LazyFrame.serialize + + Examples + -------- + >>> import io + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + ''' + def __dataframe_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def __bool__(self) -> NoReturn: ... + def _comparison_error(self, operator: str) -> NoReturn: ... + def __eq__(self, other: Any) -> NoReturn: ... + def __ne__(self, other: Any) -> NoReturn: ... + def __gt__(self, other: Any) -> NoReturn: ... + def __lt__(self, other: Any) -> NoReturn: ... + def __ge__(self, other: Any) -> NoReturn: ... + def __le__(self, other: Any) -> NoReturn: ... 
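+    # Note on the comparison dunders above: their `NoReturn` return type mirrors
+    # runtime behavior, since comparing two LazyFrames raises rather than building
+    # a query. A minimal illustrative sketch, not part of the generated stub;
+    # assumes `import polars as pl` and current Polars behavior:
+    #
+    #     lf = pl.LazyFrame({"a": [1, 2, 3]})
+    #     lf == lf                           # raises at runtime (TypeError); a type
+    #                                        # checker reading this stub treats the
+    #                                        # result as NoReturn (unreachable)
+    #     lf.collect().equals(lf.collect())  # compare materialized DataFrames instead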
+ def __contains__(self, key: str) -> bool: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __getitem__(self, item: int | range | slice) -> LazyFrame: ... + def _repr_html_(self) -> str: ... + def serialize(self, file: IOBase | str | Path | None = ...) -> str | None: + ''' + Serialize the logical plan of this LazyFrame to a file or string in JSON format. + + Parameters + ---------- + file + File path to which the result should be written. If set to `None` + (default), the output is returned as a string instead. + + See Also + -------- + LazyFrame.deserialize + + Examples + -------- + Serialize the logical plan into a JSON string. + + >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum() + >>> json = lf.serialize() + >>> json + \'{"Projection":{"expr":[{"Agg":{"Sum":{"Column":"a"}}}],"input":{"DataFrameScan":{"df":{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]}]},"schema":{"inner":{"a":"Int64"}},"output_schema":null,"projection":null,"selection":null}},"schema":{"inner":{"a":"Int64"}},"options":{"run_parallel":true,"duplicate_check":true}}}\' + + The logical plan can later be deserialized back into a LazyFrame. + + >>> import io + >>> pl.LazyFrame.deserialize(io.StringIO(json)).collect() + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + └─────┘ + ''' + def pipe(self, function: Callable[Concatenate[LazyFrame, P], T], *args: P.args, **kwargs: P.kwargs) -> T: + ''' + Offers a structured way to apply a sequence of user-defined functions (UDFs). + + Parameters + ---------- + function + Callable; will receive the frame as the first parameter, + followed by any given args/kwargs. + *args + Arguments to pass to the UDF. + **kwargs + Keyword arguments to pass to the UDF. + + Examples + -------- + >>> def cast_str_to_int(data, col_name): + ... return data.with_columns(pl.col(col_name).cast(pl.Int64)) + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": ["10", "20", "30", "40"], + ... } + ... ) + >>> lf.pipe(cast_str_to_int, col_name="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 10 │ + │ 2 ┆ 20 │ + │ 3 ┆ 30 │ + │ 4 ┆ 40 │ + └─────┴─────┘ + + >>> lf = pl.LazyFrame( + ... { + ... "b": [1, 2], + ... "a": [3, 4], + ... } + ... ) + >>> lf.collect() + shape: (2, 2) + ┌─────┬─────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 4 │ + └─────┴─────┘ + >>> lf.pipe(lambda tdf: tdf.select(sorted(tdf.columns))).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 1 │ + │ 4 ┆ 2 │ + └─────┴─────┘ + ''' + def describe(self, percentiles: Sequence[float] | float | None = ...) -> DataFrame: + ''' + Creates a summary of statistics for a LazyFrame, returning a DataFrame. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics. + All values must be in the range `[0, 1]`. + + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method used when calculating percentiles. + + Returns + ------- + DataFrame + + Notes + ----- + The median is included by default as the 50% percentile. + + Warnings + -------- + * This method does *not* maintain the laziness of the frame, and will `collect` + the final result. This could potentially be an expensive operation. + * We do not guarantee the output of `describe` to be stable. 
It will show + statistics that we deem informative, and may be updated in the future. + Using `describe` programmatically (versus interactive exploration) is + not recommended for this reason. + + Examples + -------- + >>> from datetime import date, time + >>> lf = pl.LazyFrame( + ... { + ... "float": [1.0, 2.8, 3.0], + ... "int": [40, 50, None], + ... "bool": [True, False, True], + ... "str": ["zz", "xx", "yy"], + ... "date": [date(2020, 1, 1), date(2021, 7, 5), date(2022, 12, 31)], + ... "time": [time(10, 20, 30), time(14, 45, 50), time(23, 15, 10)], + ... } + ... ) + + Show default frame statistics: + + >>> lf.describe() + shape: (9, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐ + │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │ + │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ + │ 25% ┆ 2.8 ┆ 40.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 50% ┆ 2.8 ┆ 50.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 75% ┆ 3.0 ┆ 50.0 ┆ null ┆ null ┆ 2022-12-31 ┆ 23:15:10 │ + │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ + └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘ + + Customize which percentiles are displayed, applying linear interpolation: + + >>> lf.describe( + ... percentiles=[0.1, 0.3, 0.5, 0.7, 0.9], + ... interpolation="linear", + ... ) + shape: (11, 7) + ┌────────────┬──────────┬──────────┬──────────┬──────┬────────────┬──────────┐ + │ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞════════════╪══════════╪══════════╪══════════╪══════╪════════════╪══════════╡ + │ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │ + │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │ + │ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 ┆ 16:07:10 │ + │ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │ + │ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │ + │ 10% ┆ 1.36 ┆ 41.0 ┆ null ┆ null ┆ 2020-04-20 ┆ 11:13:34 │ + │ 30% ┆ 2.08 ┆ 43.0 ┆ null ┆ null ┆ 2020-11-26 ┆ 12:59:42 │ + │ 50% ┆ 2.8 ┆ 45.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │ + │ 70% ┆ 2.88 ┆ 47.0 ┆ null ┆ null ┆ 2022-02-07 ┆ 18:09:34 │ + │ 90% ┆ 2.96 ┆ 49.0 ┆ null ┆ null ┆ 2022-09-13 ┆ 21:33:18 │ + │ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │ + └────────────┴──────────┴──────────┴──────────┴──────┴────────────┴──────────┘ + ''' + def explain(self) -> str: + ''' + Create a string representation of the query plan. + + Different optimizations can be turned on or off. + + Parameters + ---------- + optimized + Return an optimized query plan. Defaults to `True`. + If this is set to `True` the subsequent + optimization flags control which optimizations + run. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. 
+ comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).explain() # doctest: +SKIP + ''' + def show_graph(self) -> str | None: + ''' + Show a plot of the query plan. Note that you should have graphviz installed. + + Parameters + ---------- + optimized + Optimize the query plan. + show + Show the figure. + output_path + Write the figure to disk. + raw_output + Return dot syntax. This cannot be combined with `show` and/or `output_path`. + figsize + Passed to matplotlib if `show` == True. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).show_graph() # doctest: +SKIP + ''' + def inspect(self, fmt: str = ...) -> Self: + ''' + Inspect a node in the computation graph. + + Print the value that this node in the computation graph evaluates to and pass on + the value. + + Examples + -------- + >>> lf = pl.LazyFrame({"foo": [1, 1, -2, 3]}) + >>> ( + ... lf.with_columns(pl.col("foo").cum_sum().alias("bar")) + ... .inspect() # print the node before the filter + ... .filter(pl.col("bar") == pl.col("foo")) + ... ) # doctest: +ELLIPSIS + + ''' + def sort(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> Self: + ''' + Sort the LazyFrame by the given columns. + + Parameters + ---------- + by + Column(s) to sort by. Accepts expression input. Strings are parsed as column + names. + *more_by + Additional columns to sort by, specified as positional arguments. + descending + Sort in descending order. When sorting by multiple columns, can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + Examples + -------- + Pass a single column name to sort by that column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None], + ... "b": [6.0, 5.0, 4.0], + ... "c": ["a", "c", "b"], + ... } + ... ) + >>> lf.sort("a").collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + + Sorting by expressions is also supported. 
+ + >>> lf.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + └──────┴─────┴─────┘ + + Sort by multiple columns by passing a list of columns. + + >>> lf.sort(["c", "a"], descending=True).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 2 ┆ 5.0 ┆ c │ + │ null ┆ 4.0 ┆ b │ + │ 1 ┆ 6.0 ┆ a │ + └──────┴─────┴─────┘ + + Or use positional arguments to sort by multiple columns in the same way. + + >>> lf.sort("c", "a", descending=[False, True]).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞══════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ null ┆ 4.0 ┆ b │ + │ 2 ┆ 5.0 ┆ c │ + └──────┴─────┴─────┘ + ''' + def top_k(self, k: int) -> Self: + ''' + Return the `k` largest elements. + + If `descending=True` the smallest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the `k` smallest. Top-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might + be worse since this requires a stable search. + + See Also + -------- + bottom_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 largest values in column b. + + >>> lf.top_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ a ┆ 2 │ + │ b ┆ 2 │ + │ b ┆ 1 │ + └─────┴─────┘ + + Get the rows which contain the 4 largest values when sorting on column b and a. + + >>> lf.top_k(4, by=["b", "a"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 2 │ + │ c ┆ 1 │ + └─────┴─────┘ + ''' + def bottom_k(self, k: int) -> Self: + ''' + Return the `k` smallest elements. + + If `descending=True` the largest elements will be given. + + Parameters + ---------- + k + Number of rows to return. + by + Column(s) included in sort order. Accepts expression input. + Strings are parsed as column names. + descending + Return the `k` largest. Bottom-k by multiple columns can be specified + per column by passing a sequence of booleans. + nulls_last + Place null values last. + maintain_order + Whether the order should be maintained if elements are equal. + Note that if `true` streaming is not possible and performance might be + worse since this requires a stable search. + + See Also + -------- + top_k + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [2, 1, 1, 3, 2, 1], + ... } + ... ) + + Get the rows which contain the 4 smallest values in column b. + + >>> lf.bottom_k(4, by="b").collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ b ┆ 1 │ + │ a ┆ 1 │ + │ c ┆ 1 │ + │ a ┆ 2 │ + └─────┴─────┘ + + Get the rows which contain the 4 smallest values when sorting on column a and b. 
+ + >>> lf.bottom_k(4, by=["a", "b"]).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ b ┆ 1 │ + │ b ┆ 2 │ + └─────┴─────┘ + ''' + def profile(self) -> tuple[DataFrame, DataFrame]: + ''' + Profile a LazyFrame. + + This will run the query and return a tuple + containing the materialized DataFrame and a DataFrame that + contains profiling information of each node that is executed. + + The units of the timings are microseconds. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + show_plot + Show a gantt chart of the profiling result + truncate_nodes + Truncate the label lengths in the gantt chart to this number of + characters. + figsize + matplotlib figsize of the profiling plot + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).sort( + ... "a" + ... ).profile() # doctest: +SKIP + (shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘, + shape: (3, 3) + ┌─────────────────────────┬───────┬──────┐ + │ node ┆ start ┆ end │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u64 ┆ u64 │ + ╞═════════════════════════╪═══════╪══════╡ + │ optimization ┆ 0 ┆ 5 │ + │ group_by_partitioned(a) ┆ 5 ┆ 470 │ + │ sort(a) ┆ 475 ┆ 1964 │ + └─────────────────────────┴───────┴──────┘) + ''' + def collect(self) -> DataFrame | InProcessQuery: + ''' + Materialize this LazyFrame into a DataFrame. + + By default, all query optimizations are enabled. Individual optimizations may + be disabled by setting the corresponding parameter to `False`. + + Parameters + ---------- + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + no_optimization + Turn off (certain) optimizations. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + Streaming mode is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + background + Run the query in the background and get a handle to the query. + This handle can be used to fetch the result or cancel the query. 
+ + Returns + ------- + DataFrame + + See Also + -------- + fetch: Run the query on the first `n` rows only for debugging purposes. + explain : Print the query plan that is evaluated with collect. + profile : Collect the LazyFrame and time each node in the computation graph. + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.Config.set_streaming_chunk_size : Set the size of streaming batches. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + + Collect in streaming mode + + >>> lf.group_by("a").agg(pl.all().sum()).collect( + ... streaming=True + ... ) # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def collect_async(self) -> Awaitable[DataFrame] | _GeventDataFrameResult[DataFrame]: + ''' + Collect DataFrame asynchronously in thread pool. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Collects into a DataFrame (like :func:`collect`), but instead of returning + DataFrame directly, they are scheduled to be collected inside thread pool, + while this method returns almost instantly. + + May be useful if you use gevent or asyncio and want to release control to other + greenlets/tasks while LazyFrames are being collected. + + Parameters + ---------- + gevent + Return wrapper to `gevent.event.AsyncResult` instead of Awaitable + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off (certain) optimizations. + slice_pushdown + Slice pushdown optimization. + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Process the query in batches to handle larger-than-memory data. + If set to `False` (default), the entire query is processed in a single + batch. + + .. warning:: + Streaming mode is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + .. note:: + Use :func:`explain` to see if Polars can process the query in streaming + mode. + + Returns + ------- + If `gevent=False` (default) then returns awaitable. + + If `gevent=True` then returns wrapper that has + `.get(block=True, timeout=None)` method. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + + Notes + ----- + In case of error `set_exception` is used on + `asyncio.Future`/`gevent.event.AsyncResult` and will be reraised by them. + + Examples + -------- + >>> import asyncio + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> async def main(): + ... 
return await ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect_async() + ... ) + >>> asyncio.run(main()) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def sink_parquet(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a Parquet file. + + .. warning:: + Streaming mode is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'uncompressed\', \'snappy\', \'gzip\', \'lzo\', \'brotli\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + statistics + Write statistics to the parquet headers. This requires extra compute. + row_group_size + Size of the row groups in number of rows. + If None (default), the chunks of the `DataFrame` are + used. Writing in smaller chunks may reduce memory pressure and improve + writing speeds. + data_pagesize_limit + Size limit of individual data pages. + If not set defaults to 1024 * 1024 bytes + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_parquet("out.parquet") # doctest: +SKIP + ''' + def sink_ipc(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an IPC file. + + .. warning:: + Streaming mode is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + compression : {\'lz4\', \'zstd\'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. 
+ + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ipc("out.arrow") # doctest: +SKIP + ''' + def sink_csv(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to a CSV file. + + .. warning:: + Streaming mode is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. + include_bom + Whether to include UTF-8 BOM in the CSV output. + include_header + Whether to include header in the CSV output. + separator + Separate CSV fields with this symbol. + line_terminator + String used to end each row. + quote_char + Byte to use as quoting character. + batch_size + Number of rows that will be processed per thread. + datetime_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. If no format specified, the default fractional-second + precision is inferred from the maximum timeunit found in the frame\'s + Datetime cols (if any). + date_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + time_format + A format string, with the specifiers defined by the + `chrono `_ + Rust crate. + float_precision + Number of decimal places to write, applied to both `Float32` and + `Float64` datatypes. + null_value + A string representing null values (defaulting to the empty string). + quote_style : {\'necessary\', \'always\', \'non_numeric\', \'never\'} + Determines the quoting strategy used. + + - necessary (default): This puts quotes around fields only when necessary. + They are necessary when fields contain a quote, + delimiter or record terminator. + Quotes are also necessary when writing an empty record + (which is indistinguishable from a record with one empty field). + This is the default. + - always: This puts quotes around every field. Always. + - never: This never puts quotes around fields, even if that results in + invalid CSV data (e.g.: by not quoting strings containing the + separator). + - non_numeric: This puts quotes around all fields that are non-numeric. + Namely, when writing a field that does not parse as a valid float + or integer, then quotes will be used even if they aren`t strictly + necessary. + maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_csv("out.csv") # doctest: +SKIP + ''' + def sink_ndjson(self, path: str | Path) -> DataFrame: + ''' + Evaluate the query in streaming mode and write to an NDJSON file. + + .. warning:: + Streaming mode is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + This allows streaming results that are larger than RAM to be written to disk. + + Parameters + ---------- + path + File path to which the file should be written. 
+ maintain_order + Maintain the order in which data is processed. + Setting this to `False` will be slightly faster. + type_coercion + Do type coercion optimization. + predicate_pushdown + Do predicate pushdown optimization. + projection_pushdown + Do projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + slice_pushdown + Slice pushdown optimization. + no_optimization + Turn off (certain) optimizations. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.scan_csv("/path/to/my_larger_than_ram_file.csv") # doctest: +SKIP + >>> lf.sink_ndjson("out.ndjson") # doctest: +SKIP + ''' + def _set_sink_optimizations(self) -> PyLazyFrame: ... + def fetch(self, n_rows: int = ...) -> DataFrame: + ''' + Collect a small number of rows for debugging purposes. + + Parameters + ---------- + n_rows + Collect n_rows from the data sources. + type_coercion + Run type coercion optimization. + predicate_pushdown + Run predicate pushdown optimization. + projection_pushdown + Run projection pushdown optimization. + simplify_expression + Run simplify expressions optimization. + no_optimization + Turn off optimizations. + slice_pushdown + Slice pushdown optimization + comm_subplan_elim + Will try to cache branching subplans that occur on self-joins or unions. + comm_subexpr_elim + Common subexpressions will be cached and reused. + streaming + Run parts of the query in a streaming fashion (this is in an alpha state) + + Notes + ----- + This is similar to a :func:`collect` operation, but it overwrites the number of + rows read by *every* scan operation. Be aware that `fetch` does not guarantee + the final number of rows in the DataFrame. Filters, join operations and fewer + rows being available in the scanned data will all influence the final number + of rows (joins are especially susceptible to this, and may return no data + at all if `n_rows` is too small as the join keys may not be present). + + Warnings + -------- + This is strictly a utility function that can help to debug queries using a + smaller number of rows, and should *not* be used in production code. + + Returns + ------- + DataFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).fetch(2) + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 6 │ + │ b ┆ 2 ┆ 5 │ + └─────┴─────┴─────┘ + ''' + def lazy(self) -> Self: + ''' + Return lazy representation, i.e. itself. + + Useful for writing code that expects either a :class:`DataFrame` or + :class:`LazyFrame`. + + Returns + ------- + LazyFrame + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.lazy() # doctest: +ELLIPSIS + + ''' + def cache(self) -> Self: + """Cache the result once the execution of the physical plan hits this node.""" + def cast(self, dtypes: Mapping[ColumnNameOrSelector | PolarsDataType, PolarsDataType] | PolarsDataType) -> Self: + ''' + Cast LazyFrame column(s) to the specified dtype(s). + + Parameters + ---------- + dtypes + Mapping of column names (or selector) to dtypes, or a single dtype + to which all columns will be cast. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). 
+ + Examples + -------- + >>> from datetime import date + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)], + ... } + ... ) + + Cast specific frame columns to the specified dtypes: + + >>> lf.cast({"foo": pl.Float32, "bar": pl.UInt8}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ f32 ┆ u8 ┆ date │ + ╞═════╪═════╪════════════╡ + │ 1.0 ┆ 6 ┆ 2020-01-02 │ + │ 2.0 ┆ 7 ┆ 2021-03-04 │ + │ 3.0 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns matching one dtype (or dtype group) to another dtype: + + >>> lf.cast({pl.Date: pl.Datetime}).collect() + shape: (3, 3) + ┌─────┬─────┬─────────────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ datetime[μs] │ + ╞═════╪═════╪═════════════════════╡ + │ 1 ┆ 6.0 ┆ 2020-01-02 00:00:00 │ + │ 2 ┆ 7.0 ┆ 2021-03-04 00:00:00 │ + │ 3 ┆ 8.0 ┆ 2022-05-06 00:00:00 │ + └─────┴─────┴─────────────────────┘ + + Use selectors to define the columns being cast: + + >>> import polars.selectors as cs + >>> lf.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.String}).collect() + shape: (3, 3) + ┌─────┬─────┬────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ str │ + ╞═════╪═════╪════════════╡ + │ 1 ┆ 6 ┆ 2020-01-02 │ + │ 2 ┆ 7 ┆ 2021-03-04 │ + │ 3 ┆ 8 ┆ 2022-05-06 │ + └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.String).collect().to_dict(as_series=False) + {\'foo\': [\'1\', \'2\', \'3\'], + \'bar\': [\'6.0\', \'7.0\', \'8.0\'], + \'ham\': [\'2020-01-02\', \'2021-03-04\', \'2022-05-06\']} + ''' + def clear(self, n: int = ...) -> LazyFrame: + ''' + Create an empty copy of the current LazyFrame, with zero to \'n\' rows. + + Returns a copy with an identical schema but no data. + + Parameters + ---------- + n + Number of (empty) rows to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clear().fetch() + shape: (0, 3) + ┌─────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞═════╪═════╪══════╡ + └─────┴─────┴──────┘ + + >>> lf.clear(2).fetch() + shape: (2, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool │ + ╞══════╪══════╪══════╡ + │ null ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └──────┴──────┴──────┘ + ''' + def clone(self) -> Self: + ''' + Create a copy of this LazyFrame. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current LazyFrame, with identical + schema but no data. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, 2, 3, 4], + ... "b": [0.5, None, 2.5, 13], + ... "c": [True, True, False, None], + ... } + ... ) + >>> lf.clone() # doctest: +ELLIPSIS + + ''' + def filter(self, *predicates: IntoExprColumn | Iterable[IntoExprColumn] | bool | list[bool] | np.ndarray[Any, Any], **constraints: Any) -> Self: + ''' + Filter the rows in the LazyFrame based on a predicate expression. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + predicates + Expression that evaluates to a boolean Series. + constraints + Column filters; use `name = value` to filter columns by the supplied value. 
+ Each constraint will behave the same as `pl.col(name).eq(value)`, and + will be implicitly joined with the other filter conditions using `&`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + + Filter on one condition: + + >>> lf.filter(pl.col("foo") > 1).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + + Filter on multiple conditions: + + >>> lf.filter((pl.col("foo") < 3) & (pl.col("ham") == "a")).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `*args` syntax: + + >>> lf.filter( + ... pl.col("foo") == 1, + ... pl.col("ham") == "a", + ... ).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Provide multiple filters using `**kwargs` syntax: + + >>> lf.filter(foo=1, ham="a").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + Filter on an OR condition: + + >>> lf.filter((pl.col("foo") == 1) | (pl.col("ham") == "c")).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + ''' + def select(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Select columns from this LazyFrame. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Examples + -------- + Pass the name of a column to select that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.select("foo").collect() + shape: (3, 1) + ┌─────┐ + │ foo │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + └─────┘ + + Multiple columns can be selected by passing a list of column names. + + >>> lf.select(["foo", "bar"]).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 6 │ + │ 2 ┆ 7 │ + │ 3 ┆ 8 │ + └─────┴─────┘ + + Multiple columns can also be selected using positional arguments instead of a + list. Expressions are also accepted. + + >>> lf.select(pl.col("foo"), pl.col("bar") + 1).collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + └─────┴─────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.select( + ... threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0) + ... 
).collect() + shape: (3, 1) + ┌───────────┐ + │ threshold │ + │ --- │ + │ i32 │ + ╞═══════════╡ + │ 0 │ + │ 0 │ + │ 10 │ + └───────────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.select( + ... is_odd=(pl.col(pl.INTEGER_DTYPES) % 2).name.suffix("_is_odd"), + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ is_odd │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {1,0} │ + │ {0,1} │ + │ {1,0} │ + └───────────┘ + ''' + def select_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Select columns from this LazyFrame. + + This will run all expression sequentially instead of in parallel. + Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to select, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, + other non-expression inputs are parsed as literals. + **named_exprs + Additional columns to select, specified as keyword arguments. + The columns will be renamed to the keyword used. + + See Also + -------- + select + """ + def group_by(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + ''' + Start a group by operation. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Setting this to `True` blocks the possibility + to run on the streaming engine. + + Examples + -------- + Group by one column and call `agg` to compute the grouped sum of another + column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": ["a", "b", "a", "b", "c"], + ... "b": [1, 2, 1, 3, 3], + ... "c": [5, 4, 3, 2, 1], + ... } + ... ) + >>> lf.group_by("a").agg(pl.col("b").sum()).collect() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ a ┆ 2 │ + │ b ┆ 5 │ + │ c ┆ 3 │ + └─────┴─────┘ + + Set `maintain_order=True` to ensure the order of the groups is consistent with + the input. + + >>> lf.group_by("a", maintain_order=True).agg(pl.col("c")).collect() + shape: (3, 2) + ┌─────┬───────────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ str ┆ list[i64] │ + ╞═════╪═══════════╡ + │ a ┆ [5, 3] │ + │ b ┆ [4, 2] │ + │ c ┆ [1] │ + └─────┴───────────┘ + + Group by multiple columns by passing a list of column names. + + >>> lf.group_by(["a", "b"]).agg(pl.max("c")).collect() # doctest: +SKIP + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 1 ┆ 5 │ + │ b ┆ 2 ┆ 4 │ + │ b ┆ 3 ┆ 2 │ + │ c ┆ 3 ┆ 1 │ + └─────┴─────┴─────┘ + + Or use positional arguments to group by multiple columns in the same way. + Expressions are also accepted. + + >>> lf.group_by("a", pl.col("b") // 2).agg( + ... pl.col("c").mean() + ... ).collect() # doctest: +SKIP + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 0 ┆ 4.0 │ + │ b ┆ 1 ┆ 3.0 │ + │ c ┆ 1 ┆ 1.0 │ + └─────┴─────┴─────┘ + ''' + def rolling(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Create rolling groups based on a time, Int32, or Int64 column. 
+ + Different from a `dynamic_group_by`, the windows are now determined by the + individual values and are not of constant intervals. For constant intervals + use :func:`LazyFrame.group_by_dynamic`. + + If you have a time series `<t_0, t_1, ..., t_n>`, then by default the + windows created will be + + * (t_0 - period, t_0] + * (t_1 - period, t_1] + * ... + * (t_n - period, t_n] + + whereas if you pass a non-default `offset`, then the windows will be + + * (t_0 + offset, t_0 + offset + period] + * (t_1 + offset, t_1 + offset + period] + * ... + * (t_n + offset, t_n + offset + period] + + The `period` and `offset` arguments are created either from a timedelta, or + by using the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a rolling operation on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars cannot check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + group_by_dynamic + + Examples + -------- + >>> dates = [ + ... "2020-01-01 13:45:48", + ... "2020-01-01 16:42:13", + ... "2020-01-01 16:45:09", + ... "2020-01-02 18:12:48", + ... "2020-01-03 19:45:32", + ... "2020-01-08 23:16:43", + ... ] + >>> df = pl.LazyFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns( + ... pl.col("dt").str.strptime(pl.Datetime).set_sorted() + ... ) + >>> out = ( + ... df.rolling(index_column="dt", period="2d") + ... .agg( + ... pl.sum("a").alias("sum_a"), + ... pl.min("a").alias("min_a"), + ... pl.max("a").alias("max_a"), + ... ) + ... .collect() + ...
) + >>> out + shape: (6, 4) + ┌─────────────────────┬───────┬───────┬───────┐ + │ dt ┆ sum_a ┆ min_a ┆ max_a │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │ + ╞═════════════════════╪═══════╪═══════╪═══════╡ + │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │ + │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │ + │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │ + │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │ + │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │ + │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │ + └─────────────────────┴───────┴───────┴───────┘ + ''' + def group_by_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + Time windows are calculated and rows are assigned to windows. Different from a + normal group by is that a row can be member of multiple groups. + By default, the windows look like: + + - [start, start + period) + - [start + every, start + every + period) + - [start + 2*every, start + 2*every + period) + - ... + + where `start` is determined by `start_by`, `offset`, and `every` (see parameter + descriptions below). + + .. warning:: + The index column must be sorted in ascending order. If `by` is passed, then + the index column must be sorted in ascending order within each group. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + + .. deprecated:: 0.19.4 + Use `label` instead. + include_boundaries + Add the lower and upper bound of the window to the "_lower_boundary" and + "_upper_boundary" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'left\', \'right\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). + label : {\'left\', \'right\', \'datapoint\'} + Define which label to use for the window: + + - \'left\': lower boundary of the window + - \'right\': upper boundary of the window + - \'datapoint\': the first value of the index column in the given window. + If you don\'t need the label to be at one of the boundaries, choose this + option for maximum performance + by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. 
+ check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + + See Also + -------- + rolling + + Notes + ----- + 1) If you\'re coming from pandas, then + + .. code-block:: python + + # polars + df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum()) + + is equivalent to + + .. code-block:: python + + # pandas + df.set_index("ts").resample("D")["value"].sum().reset_index() + + though note that, unlike pandas, polars doesn\'t add extra rows for empty + windows. If you need `index_column` to be evenly spaced, then please combine + with :func:`DataFrame.upsample`. + + 2) The `every`, `period` and `offset` arguments are created with + the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day (which may + not be 24 hours, due to daylight savings). Similarly for "calendar week", + "calendar month", "calendar quarter", and "calendar year". + + In case of a group_by_dynamic on an integer column, the windows are defined by: + + - "1i" # length 1 + - "10i" # length 10 + + Examples + -------- + >>> from datetime import datetime + >>> lf = pl.LazyFrame( + ... { + ... "time": pl.datetime_range( + ... start=datetime(2021, 12, 16), + ... end=datetime(2021, 12, 16, 3), + ... interval="30m", + ... eager=True, + ... ), + ... "n": range(7), + ... } + ... ) + >>> lf.collect() + shape: (7, 2) + ┌─────────────────────┬─────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i64 │ + ╞═════════════════════╪═════╡ + │ 2021-12-16 00:00:00 ┆ 0 │ + │ 2021-12-16 00:30:00 ┆ 1 │ + │ 2021-12-16 01:00:00 ┆ 2 │ + │ 2021-12-16 01:30:00 ┆ 3 │ + │ 2021-12-16 02:00:00 ┆ 4 │ + │ 2021-12-16 02:30:00 ┆ 5 │ + │ 2021-12-16 03:00:00 ┆ 6 │ + └─────────────────────┴─────┘ + + Group by windows of 1 hour starting at 2021-12-16 00:00:00. + + >>> lf.group_by_dynamic("time", every="1h", closed="right").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [1, 2] │ + │ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ 2021-12-16 02:00:00 ┆ [5, 6] │ + └─────────────────────┴───────────┘ + + The window boundaries can also be added to the aggregation result + + >>> lf.group_by_dynamic( + ... "time", every="1h", include_boundaries=True, closed="right" + ... 
).agg(pl.col("n").mean()).collect() + shape: (4, 4) + ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐ + │ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │ + ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡ + │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │ + │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │ + │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │ + │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │ + └─────────────────────┴─────────────────────┴─────────────────────┴─────┘ + + When closed="left", the window excludes the right end of interval: + [lower_bound, upper_bound) + + >>> lf.group_by_dynamic("time", every="1h", closed="left").agg( + ... pl.col("n") + ... ).collect() + shape: (4, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-16 00:00:00 ┆ [0, 1] │ + │ 2021-12-16 01:00:00 ┆ [2, 3] │ + │ 2021-12-16 02:00:00 ┆ [4, 5] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + When closed="both" the time values at the window boundaries belong to 2 groups. + + >>> lf.group_by_dynamic("time", every="1h", closed="both").agg( + ... pl.col("n") + ... ).collect() + shape: (5, 2) + ┌─────────────────────┬───────────┐ + │ time ┆ n │ + │ --- ┆ --- │ + │ datetime[μs] ┆ list[i64] │ + ╞═════════════════════╪═══════════╡ + │ 2021-12-15 23:00:00 ┆ [0] │ + │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │ + │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │ + │ 2021-12-16 03:00:00 ┆ [6] │ + └─────────────────────┴───────────┘ + + Dynamic group bys can also be combined with grouping on normal keys + + >>> lf = lf.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"])) + >>> lf.collect() + shape: (7, 3) + ┌─────────────────────┬─────┬────────┐ + │ time ┆ n ┆ groups │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ i64 ┆ str │ + ╞═════════════════════╪═════╪════════╡ + │ 2021-12-16 00:00:00 ┆ 0 ┆ a │ + │ 2021-12-16 00:30:00 ┆ 1 ┆ a │ + │ 2021-12-16 01:00:00 ┆ 2 ┆ a │ + │ 2021-12-16 01:30:00 ┆ 3 ┆ b │ + │ 2021-12-16 02:00:00 ┆ 4 ┆ b │ + │ 2021-12-16 02:30:00 ┆ 5 ┆ a │ + │ 2021-12-16 03:00:00 ┆ 6 ┆ a │ + └─────────────────────┴─────┴────────┘ + >>> lf.group_by_dynamic( + ... "time", + ... every="1h", + ... closed="both", + ... by="groups", + ... include_boundaries=True, + ... 
).agg(pl.col("n")).collect() + shape: (7, 5) + ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐ + │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │ + ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡ + │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ [0] │ + │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │ + │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │ + │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │ + │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │ + │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │ + │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │ + └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘ + + Dynamic group by on an index column + + >>> lf = pl.LazyFrame( + ... { + ... "idx": pl.int_range(0, 6, eager=True), + ... "A": ["A", "A", "B", "B", "B", "C"], + ... } + ... ) + >>> lf.group_by_dynamic( + ... "idx", + ... every="2i", + ... period="3i", + ... include_boundaries=True, + ... closed="right", + ... ).agg(pl.col("A").alias("A_agg_list")).collect() + shape: (4, 4) + ┌─────────────────┬─────────────────┬─────┬─────────────────┐ + │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ list[str] │ + ╞═════════════════╪═════════════════╪═════╪═════════════════╡ + │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │ + │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │ + │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │ + │ 4 ┆ 7 ┆ 4 ┆ ["C"] │ + └─────────────────┴─────────────────┴─────┴─────────────────┘ + ''' + def join_asof(self, other: LazyFrame) -> Self: + ''' + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than + equal keys. + + Both DataFrames must be sorted by the join_asof key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + \'on\' key is less than or equal to the left\'s key. + + - A "forward" search selects the first row in the right DataFrame whose + \'on\' key is greater than or equal to the left\'s key. + + A "nearest" search selects the last row in the right DataFrame whose value + is nearest to the left\'s key. String keys are not currently supported for a + nearest search. + + The default is "backward". + + Parameters + ---------- + other + Lazy DataFrame to join with. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + by + Join on these columns before doing asof join. + by_left + Join on these columns before doing asof join. + by_right + Join on these columns before doing asof join. + strategy : {\'backward\', \'forward\', \'nearest\'} + Join strategy. + suffix + Suffix to append to columns with a duplicate name. + tolerance + Numeric tolerance. By setting this the join will only be done if the near + keys are within this distance. 
If an asof join is done on columns of dtype + "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta + object or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + - 1i (1 index count) + + Or combine them: + "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + Examples + -------- + >>> from datetime import datetime + >>> gdp = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... ], # note record date: Jan 1st (sorted!) + ... "gdp": [4164, 4411, 4566, 4696], + ... } + ... ).set_sorted("date") + >>> population = pl.LazyFrame( + ... { + ... "date": [ + ... datetime(2016, 5, 12), + ... datetime(2017, 5, 12), + ... datetime(2018, 5, 12), + ... datetime(2019, 5, 12), + ... ], # note record date: May 12th (sorted!) + ... "population": [82.19, 82.66, 83.12, 83.52], + ... } + ... ).set_sorted("date") + >>> population.join_asof(gdp, on="date", strategy="backward").collect() + shape: (4, 3) + ┌─────────────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ + │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ + │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + ''' + def join(self, other: LazyFrame, on: str | Expr | Sequence[str | Expr] | None = ..., how: JoinStrategy = ...) -> Self: + ''' + Add a join operation to the Logical Plan. + + Parameters + ---------- + other + Lazy DataFrame to join with. + on + Join column of both DataFrames. If set, `left_on` and `right_on` should be + None. + how : {\'inner\', \'left\', \'outer\', \'semi\', \'anti\', \'cross\', \'outer_coalesce\'} + Join strategy. + + * *inner* + Returns rows that have matching values in both tables + * *left* + Returns all rows from the left table, and the matched rows from the + right table + * *outer* + Returns all rows when there is a match in either left or right table + * *outer_coalesce* + Same as \'outer\', but coalesces the key columns + * *cross* + Returns the Cartesian product of rows from both tables + * *semi* + Filter rows that have a match in the right table. + * *anti* + Filter rows that do not have a match in the right table. + + .. note:: + A left join preserves the row order of the left DataFrame. + left_on + Join column of the left DataFrame. + right_on + Join column of the right DataFrame. + suffix + Suffix to append to columns with a duplicate name. + validate: {\'m:m\', \'m:1\', \'1:m\', \'1:1\'} + Checks if join is of specified type.
+ + * *many_to_many* + “m:m”: default, does not result in checks + * *one_to_one* + “1:1”: check if join keys are unique in both left and right datasets + * *one_to_many* + “1:m”: check if join keys are unique in left dataset + * *many_to_one* + “m:1”: check if join keys are unique in right dataset + + .. note:: + + - This is currently not supported the streaming engine. + - This is only supported when joined by single columns. + join_nulls + Join on null values. By default null values will never produce matches. + allow_parallel + Allow the physical plan to optionally evaluate the computation of both + DataFrames up to the join in parallel. + force_parallel + Force the physical plan to evaluate the computation of both DataFrames up to + the join in parallel. + + See Also + -------- + join_asof + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> other_lf = pl.LazyFrame( + ... { + ... "apple": ["x", "y", "z"], + ... "ham": ["a", "b", "d"], + ... } + ... ) + >>> lf.join(other_lf, on="ham").collect() + shape: (2, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="outer").collect() + shape: (4, 5) + ┌──────┬──────┬──────┬───────┬───────────┐ + │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str ┆ str │ + ╞══════╪══════╪══════╪═══════╪═══════════╡ + │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │ + │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │ + │ null ┆ null ┆ null ┆ z ┆ d │ + │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │ + └──────┴──────┴──────┴───────┴───────────┘ + >>> lf.join(other_lf, on="ham", how="left").collect() + shape: (3, 4) + ┌─────┬─────┬─────┬───────┐ + │ foo ┆ bar ┆ ham ┆ apple │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str ┆ str │ + ╞═════╪═════╪═════╪═══════╡ + │ 1 ┆ 6.0 ┆ a ┆ x │ + │ 2 ┆ 7.0 ┆ b ┆ y │ + │ 3 ┆ 8.0 ┆ c ┆ null │ + └─────┴─────┴─────┴───────┘ + >>> lf.join(other_lf, on="ham", how="semi").collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + └─────┴─────┴─────┘ + >>> lf.join(other_lf, on="ham", how="anti").collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + ''' + def with_columns(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + ''' + Add columns to this LazyFrame. + + Added columns will replace existing columns with the same name. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + Notes + ----- + Creating a new LazyFrame using this method does not create a new copy of + existing data. + + Examples + -------- + Pass an expression to add it as a new column. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [0.5, 4, 10, 13], + ... "c": [True, True, False, True], + ... } + ... 
) + >>> lf.with_columns((pl.col("a") ** 2).alias("a^2")).collect() + shape: (4, 4) + ┌─────┬──────┬───────┬──────┐ + │ a ┆ b ┆ c ┆ a^2 │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 │ + ╞═════╪══════╪═══════╪══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 │ + └─────┴──────┴───────┴──────┘ + + Added columns will replace existing columns with the same name. + + >>> lf.with_columns(pl.col("a").cast(pl.Float64)).collect() + shape: (4, 3) + ┌─────┬──────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╡ + │ 1.0 ┆ 0.5 ┆ true │ + │ 2.0 ┆ 4.0 ┆ true │ + │ 3.0 ┆ 10.0 ┆ false │ + │ 4.0 ┆ 13.0 ┆ true │ + └─────┴──────┴───────┘ + + Multiple columns can be added by passing a list of expressions. + + >>> lf.with_columns( + ... [ + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ] + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Multiple columns also can be added using positional arguments instead of a list. + + >>> lf.with_columns( + ... (pl.col("a") ** 2).alias("a^2"), + ... (pl.col("b") / 2).alias("b/2"), + ... (pl.col("c").not_()).alias("not c"), + ... ).collect() + shape: (4, 6) + ┌─────┬──────┬───────┬──────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │ + └─────┴──────┴───────┴──────┴──────┴───────┘ + + Use keyword arguments to easily name your expression inputs. + + >>> lf.with_columns( + ... ab=pl.col("a") * pl.col("b"), + ... not_c=pl.col("c").not_(), + ... ).collect() + shape: (4, 5) + ┌─────┬──────┬───────┬──────┬───────┐ + │ a ┆ b ┆ c ┆ ab ┆ not_c │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │ + ╞═════╪══════╪═══════╪══════╪═══════╡ + │ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │ + │ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │ + │ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │ + │ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │ + └─────┴──────┴───────┴──────┴───────┘ + + Expressions with multiple outputs can be automatically instantiated as Structs + by enabling the setting `Config.set_auto_structify(True)`: + + >>> with pl.Config(auto_structify=True): + ... lf.drop("c").with_columns( + ... diffs=pl.col(["a", "b"]).diff().name.suffix("_diff"), + ... ).collect() + shape: (4, 3) + ┌─────┬──────┬─────────────┐ + │ a ┆ b ┆ diffs │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ struct[2] │ + ╞═════╪══════╪═════════════╡ + │ 1 ┆ 0.5 ┆ {null,null} │ + │ 2 ┆ 4.0 ┆ {1,3.5} │ + │ 3 ┆ 10.0 ┆ {1,6.0} │ + │ 4 ┆ 13.0 ┆ {1,3.0} │ + └─────┴──────┴─────────────┘ + ''' + def with_columns_seq(self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr) -> Self: + """ + Add columns to this LazyFrame. + + Added columns will replace existing columns with the same name. + + This will run all expression sequentially instead of in parallel. 
+ Use this when the work per expression is cheap. + + Parameters + ---------- + *exprs + Column(s) to add, specified as positional arguments. + Accepts expression input. Strings are parsed as column names, other + non-expression inputs are parsed as literals. + **named_exprs + Additional columns to add, specified as keyword arguments. + The columns will be renamed to the keyword used. + + Returns + ------- + LazyFrame + A new LazyFrame with the columns added. + + See Also + -------- + with_columns + """ + def with_context(self, other: Self | list[Self]) -> Self: + ''' + Add an external context to the computation graph. + + This allows expressions to also access columns from DataFrames + that are not part of this one. + + Parameters + ---------- + other + Lazy DataFrame to join with. + + Examples + -------- + >>> lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "c", None]}) + >>> lf_other = pl.LazyFrame({"c": ["foo", "ham"]}) + >>> lf.with_context(lf_other).select( + ... pl.col("b") + pl.col("c").first() + ... ).collect() + shape: (3, 1) + ┌──────┐ + │ b │ + │ --- │ + │ str │ + ╞══════╡ + │ afoo │ + │ cfoo │ + │ null │ + └──────┘ + + Fill nulls with the median from another DataFrame: + + >>> train_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, 0, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf = pl.LazyFrame( + ... {"feature_0": [-1.0, None, 1], "feature_1": [-1.0, 0, 1]} + ... ) + >>> test_lf.with_context( + ... train_lf.select(pl.all().name.suffix("_train")) + ... ).select( + ... pl.col("feature_0").fill_null(pl.col("feature_0_train").median()) + ... ).collect() + shape: (3, 1) + ┌───────────┐ + │ feature_0 │ + │ --- │ + │ f64 │ + ╞═══════════╡ + │ -1.0 │ + │ 0.0 │ + │ 1.0 │ + └───────────┘ + ''' + def drop(self, *columns: ColumnNameOrSelector | Iterable[ColumnNameOrSelector]) -> Self: + ''' + Remove columns from the DataFrame. + + Parameters + ---------- + *columns + Names of the columns that should be removed from the dataframe. + Accepts column selector input. + + Examples + -------- + Drop a single column by passing the name of that column. + + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.drop("ham").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ foo ┆ bar │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪═════╡ + │ 1 ┆ 6.0 │ + │ 2 ┆ 7.0 │ + │ 3 ┆ 8.0 │ + └─────┴─────┘ + + Drop multiple columns by passing a selector. + + >>> import polars.selectors as cs + >>> lf.drop(cs.numeric()).collect() + shape: (3, 1) + ┌─────┐ + │ ham │ + │ --- │ + │ str │ + ╞═════╡ + │ a │ + │ b │ + │ c │ + └─────┘ + + Use positional arguments to drop multiple columns. + + >>> lf.drop("foo", "ham").collect() + shape: (3, 1) + ┌─────┐ + │ bar │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 6.0 │ + │ 7.0 │ + │ 8.0 │ + └─────┘ + ''' + def rename(self, mapping: dict[str, str] | Callable[[str], str]) -> Self: + ''' + Rename column names. + + Parameters + ---------- + mapping + Key value pairs that map from old name to new name, or a function + that takes the old name as input and returns the new name. + + Notes + ----- + If existing names are swapped (e.g. \'A\' points to \'B\' and \'B\' points to \'A\'), + polars will block projection and predicate pushdowns at this node. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, 7, 8], + ... "ham": ["a", "b", "c"], + ... } + ... 
) + >>> lf.rename({"foo": "apple"}).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ apple ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═══════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └───────┴─────┴─────┘ + >>> lf.rename(lambda column_name: "c" + column_name[1:]).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ coo ┆ car ┆ cam │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + │ 2 ┆ 7 ┆ b │ + │ 3 ┆ 8 ┆ c │ + └─────┴─────┴─────┘ + ''' + def reverse(self) -> Self: + ''' + Reverse the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "key": ["a", "b", "c"], + ... "val": [1, 2, 3], + ... } + ... ) + >>> lf.reverse().collect() + shape: (3, 2) + ┌─────┬─────┐ + │ key ┆ val │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════╪═════╡ + │ c ┆ 3 │ + │ b ┆ 2 │ + │ a ┆ 1 │ + └─────┴─────┘ + ''' + def shift(self, n: int | IntoExprColumn = ...) -> Self: + ''' + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.shift().collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ null ┆ null │ + │ 1 ┆ 5 │ + │ 2 ┆ 6 │ + │ 3 ┆ 7 │ + └──────┴──────┘ + + Pass a negative value to shift in the opposite direction instead. + + >>> lf.shift(-2).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞══════╪══════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ null ┆ null │ + │ null ┆ null │ + └──────┴──────┘ + + Specify `fill_value` to fill the resulting null values. + + >>> lf.shift(-2, fill_value=100).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 7 │ + │ 4 ┆ 8 │ + │ 100 ┆ 100 │ + │ 100 ┆ 100 │ + └─────┴─────┘ + ''' + def slice(self, offset: int, length: int | None = ...) -> Self: + ''' + Get a slice of this DataFrame. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> lf.slice(1, 2).collect() + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ y ┆ 3 ┆ 4 │ + │ z ┆ 5 ┆ 6 │ + └─────┴─────┴─────┘ + ''' + def limit(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Alias for :func:`LazyFrame.head`. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... 
"b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.limit().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.limit(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + ''' + def head(self, n: int = ...) -> Self: + ''' + Get the first `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Notes + ----- + Consider using the :func:`fetch` operation if you only want to test your + query. The :func:`fetch` operation will load the first `n` rows at the scan + level, whereas the :func:`head`/:func:`limit` are applied at the end. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.head().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + └─────┴─────┘ + >>> lf.head(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 7 │ + │ 2 ┆ 8 │ + └─────┴─────┘ + ''' + def tail(self, n: int = ...) -> Self: + ''' + Get the last `n` rows. + + Parameters + ---------- + n + Number of rows to return. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4, 5, 6], + ... "b": [7, 8, 9, 10, 11, 12], + ... } + ... ) + >>> lf.tail().collect() + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 3 ┆ 9 │ + │ 4 ┆ 10 │ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + >>> lf.tail(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 11 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + ''' + def last(self) -> Self: + ''' + Get the last row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.last().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 5 ┆ 6 │ + └─────┴─────┘ + ''' + def first(self) -> Self: + ''' + Get the first row of the DataFrame. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.first().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 2 │ + └─────┴─────┘ + ''' + def approx_n_unique(self) -> Self: + ''' + Approximate count of unique values. + + This is done using the HyperLogLog++ algorithm for cardinality estimation. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.approx_n_unique().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ u32 ┆ u32 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + ''' + def with_row_index(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a row index as the first column in the LazyFrame. + + Parameters + ---------- + name + Name of the index column. + offset + Start the index at this offset. Cannot be negative. + + Warnings + -------- + Using this function can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Notes + ----- + The resulting column does not have any special properties. 
It is a regular + column of type `UInt32` (or `UInt64` in `polars-u64-idx`). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_index().collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + >>> lf.with_row_index("id", offset=1000).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ id ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1000 ┆ 1 ┆ 2 │ + │ 1001 ┆ 3 ┆ 4 │ + │ 1002 ┆ 5 ┆ 6 │ + └──────┴─────┴─────┘ + + An index column can also be created using the expressions :func:`int_range` + and :func:`len`. + + >>> lf.select( + ... pl.int_range(pl.len(), dtype=pl.UInt32).alias("index"), + ... pl.all(), + ... ).collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + ''' + def with_row_count(self, name: str = ..., offset: int = ...) -> Self: + ''' + Add a column at index 0 that counts the rows. + + .. deprecated:: + Use :meth:`with_row_index` instead. + Note that the default column name has changed from \'row_nr\' to \'index\'. + + Parameters + ---------- + name + Name of the column to add. + offset + Start the row count at this offset. + + Warnings + -------- + This can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_count().collect() # doctest: +SKIP + shape: (3, 3) + ┌────────┬─────┬─────┐ + │ row_nr ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞════════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └────────┴─────┴─────┘ + ''' + def gather_every(self, n: int, offset: int = ...) -> Self: + ''' + Take every nth row in the LazyFrame and return as a new LazyFrame. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [5, 6, 7, 8], + ... } + ... ) + >>> lf.gather_every(2).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 5 │ + │ 3 ┆ 7 │ + └─────┴─────┘ + >>> lf.gather_every(2, offset=1).collect() + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 6 │ + │ 4 ┆ 8 │ + └─────┴─────┘ + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Self: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + matches_supertype + Fill all matching supertypes of the fill `value` literal. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, None, 4], + ... "b": [0.5, 4, None, 13], + ... } + ... 
) + >>> lf.fill_null(99).collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 99 ┆ 99.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + >>> lf.fill_null(strategy="forward").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="max").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 4 ┆ 13.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + + >>> lf.fill_null(strategy="zero").collect() + shape: (4, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ f64 │ + ╞═════╪══════╡ + │ 1 ┆ 0.5 │ + │ 2 ┆ 4.0 │ + │ 0 ┆ 0.0 │ + │ 4 ┆ 13.0 │ + └─────┴──────┘ + ''' + def fill_nan(self, value: int | float | Expr | None) -> Self: + ''' + Fill floating point NaN values. + + Parameters + ---------- + value + Value to fill the NaN values with. + + Warnings + -------- + Note that floating point NaN (Not a Number) are not missing values! + To replace missing values, use :func:`fill_null` instead. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1.5, 2, float("nan"), 4], + ... "b": [0.5, 4, float("nan"), 13], + ... } + ... ) + >>> lf.fill_nan(99).collect() + shape: (4, 2) + ┌──────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════╡ + │ 1.5 ┆ 0.5 │ + │ 2.0 ┆ 4.0 │ + │ 99.0 ┆ 99.0 │ + │ 4.0 ┆ 13.0 │ + └──────┴──────┘ + ''' + def std(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their standard deviation value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.std().collect() + shape: (1, 2) + ┌──────────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪═════╡ + │ 1.290994 ┆ 0.5 │ + └──────────┴─────┘ + >>> lf.std(ddof=0).collect() + shape: (1, 2) + ┌──────────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════════╡ + │ 1.118034 ┆ 0.433013 │ + └──────────┴──────────┘ + ''' + def var(self, ddof: int = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their variance value. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.var().collect() + shape: (1, 2) + ┌──────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════════╪══════╡ + │ 1.666667 ┆ 0.25 │ + └──────────┴──────┘ + >>> lf.var(ddof=0).collect() + shape: (1, 2) + ┌──────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪════════╡ + │ 1.25 ┆ 0.1875 │ + └──────┴────────┘ + ''' + def max(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their maximum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... 
) + >>> lf.max().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 2 │ + └─────┴─────┘ + ''' + def min(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their minimum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.min().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + └─────┴─────┘ + ''' + def sum(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their sum value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.sum().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 10 ┆ 5 │ + └─────┴─────┘ + ''' + def mean(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their mean value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.mean().collect() + shape: (1, 2) + ┌─────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪══════╡ + │ 2.5 ┆ 1.25 │ + └─────┴──────┘ + ''' + def median(self) -> Self: + ''' + Aggregate the columns in the LazyFrame to their median value. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.median().collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 2.5 ┆ 1.0 │ + └─────┴─────┘ + ''' + def null_count(self) -> Self: + ''' + Aggregate the columns in the LazyFrame as the sum of their null value count. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... } + ... ) + >>> lf.null_count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 1 ┆ 0 │ + └─────┴─────┴─────┘ + ''' + def quantile(self, quantile: float | Expr, interpolation: RollingInterpolationMethod = ...) -> Self: + ''' + Aggregate the columns in the LazyFrame to their quantile value. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 2, 3, 4], + ... "b": [1, 2, 1, 1], + ... } + ... ) + >>> lf.quantile(0.7).collect() + shape: (1, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞═════╪═════╡ + │ 3.0 ┆ 1.0 │ + └─────┴─────┘ + ''' + def explode(self, columns: str | Expr | Sequence[str | Expr], *more_columns: str | Expr) -> Self: + ''' + Explode the DataFrame to long format by exploding the given columns. + + Parameters + ---------- + columns + Column names, expressions, or a selector defining them. The underlying + columns being exploded must be of List or String datatype. + *more_columns + Additional names of columns to explode, specified as positional arguments. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "letters": ["a", "a", "b", "c"], + ... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]], + ... } + ... 
) + >>> lf.explode("numbers").collect() + shape: (8, 2) + ┌─────────┬─────────┐ + │ letters ┆ numbers │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═════════╪═════════╡ + │ a ┆ 1 │ + │ a ┆ 2 │ + │ a ┆ 3 │ + │ b ┆ 4 │ + │ b ┆ 5 │ + │ c ┆ 6 │ + │ c ┆ 7 │ + │ c ┆ 8 │ + └─────────┴─────────┘ + ''' + def unique(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop duplicate rows from this DataFrame. + + Parameters + ---------- + subset + Column name(s) or selector(s), to consider when identifying + duplicate rows. If set to `None` (default), use all columns. + keep : {\'first\', \'last\', \'any\', \'none\'} + Which of the duplicate rows to keep. + + * \'any\': Does not give any guarantee of which row is kept. + This allows more optimizations. + * \'none\': Don\'t keep duplicate rows. + * \'first\': Keep first unique row. + * \'last\': Keep last unique row. + maintain_order + Keep the same order as the original DataFrame. This is more expensive to + compute. + Settings this to `True` blocks the possibility + to run on the streaming engine. + + Returns + ------- + LazyFrame + LazyFrame with unique rows. + + Warnings + -------- + This method will fail if there is a column of type `List` in the DataFrame or + subset. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3, 1], + ... "bar": ["a", "a", "a", "a"], + ... "ham": ["b", "b", "b", "b"], + ... } + ... ) + >>> lf.unique(maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(subset=["bar", "ham"], maintain_order=True).collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + >>> lf.unique(keep="last", maintain_order=True).collect() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str │ + ╞═════╪═════╪═════╡ + │ 2 ┆ a ┆ b │ + │ 3 ┆ a ┆ b │ + │ 1 ┆ a ┆ b │ + └─────┴─────┴─────┘ + ''' + def drop_nulls(self, subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = ...) -> Self: + ''' + Drop all rows that contain null values. + + The original order of the remaining rows is preserved. + + Parameters + ---------- + subset + Column name(s) for which null values are considered. + If set to `None` (default), use all columns. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, 2, 3], + ... "bar": [6, None, 8], + ... "ham": ["a", "b", None], + ... } + ... ) + + The default behavior of this method is to drop rows where any single + value of the row is null. + + >>> lf.drop_nulls().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6 ┆ a │ + └─────┴─────┴─────┘ + + This behaviour can be constrained to consider only a subset of columns, as + defined by name or with a selector. For example, dropping rows if there is + a null in any of the integer columns: + + >>> import polars.selectors as cs + >>> lf.drop_nulls(subset=cs.integer()).collect() + shape: (2, 3) + ┌─────┬─────┬──────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪═════╪══════╡ + │ 1 ┆ 6 ┆ a │ + │ 3 ┆ 8 ┆ null │ + └─────┴─────┴──────┘ + + This method drops a row if any single value of the row is null. 
+ + Below are some example snippets that show how you could drop null + values based on other conditions: + + >>> lf = pl.LazyFrame( + ... { + ... "a": [None, None, None, None], + ... "b": [1, 2, None, 1], + ... "c": [1, None, None, 1], + ... } + ... ) + >>> lf.collect() + shape: (4, 3) + ┌──────┬──────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ null ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴──────┴──────┘ + + Drop a row only if all values are null: + + >>> lf.filter(~pl.all_horizontal(pl.all().is_null())).collect() + shape: (3, 3) + ┌──────┬─────┬──────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ null ┆ i64 ┆ i64 │ + ╞══════╪═════╪══════╡ + │ null ┆ 1 ┆ 1 │ + │ null ┆ 2 ┆ null │ + │ null ┆ 1 ┆ 1 │ + └──────┴─────┴──────┘ + ''' + def melt(self, id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = ..., variable_name: str | None = ..., value_name: str | None = ...) -> Self: + ''' + Unpivot a DataFrame from wide to long format. + + Optionally leaves identifiers set. + + This function is useful to massage a DataFrame into a format where one or more + columns are identifier variables (id_vars) while all other columns, considered + measured variables (value_vars), are "unpivoted" to the row axis leaving just + two non-identifier columns, \'variable\' and \'value\'. + + Parameters + ---------- + id_vars + Column(s) or selector(s) to use as identifier variables. + value_vars + Column(s) or selector(s) to use as values variables; if `value_vars` + is empty all columns that are not in `id_vars` will be used. + variable_name + Name to give to the `variable` column. Defaults to "variable" + value_name + Name to give to the `value` column. Defaults to "value" + streamable + Allow this node to run in the streaming engine. + If this runs in streaming, the output of the melt operation + will not have a stable ordering. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": ["x", "y", "z"], + ... "b": [1, 3, 5], + ... "c": [2, 4, 6], + ... } + ... ) + >>> import polars.selectors as cs + >>> lf.melt(id_vars="a", value_vars=cs.numeric()).collect() + shape: (6, 3) + ┌─────┬──────────┬───────┐ + │ a ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 │ + ╞═════╪══════════╪═══════╡ + │ x ┆ b ┆ 1 │ + │ y ┆ b ┆ 3 │ + │ z ┆ b ┆ 5 │ + │ x ┆ c ┆ 2 │ + │ y ┆ c ┆ 4 │ + │ z ┆ c ┆ 6 │ + └─────┴──────────┴───────┘ + ''' + def map_batches(self, function: Callable[[DataFrame], DataFrame]) -> Self: + ''' + Apply a custom function. + + It is important that the function returns a Polars DataFrame. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars\' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. 
+ streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + + Warnings + -------- + The `schema` of a `LazyFrame` must always be correct. It is up to the caller + of this function to ensure that this invariant is upheld. + + It is important that the optimization flags are correct. If the custom function + for instance does an aggregation of a column, `predicate_pushdown` should not + be allowed, as this prunes rows and will influence your aggregation results. + + Examples + -------- + >>> lf = ( # doctest: +SKIP + ... pl.LazyFrame( + ... { + ... "a": pl.int_range(-100_000, 0, eager=True), + ... "b": pl.int_range(0, 100_000, eager=True), + ... } + ... ) + ... .map_batches(lambda x: 2 * x, streamable=True) + ... .collect(streaming=True) + ... ) + shape: (100_000, 2) + ┌─────────┬────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════════╪════════╡ + │ -200000 ┆ 0 │ + │ -199998 ┆ 2 │ + │ -199996 ┆ 4 │ + │ -199994 ┆ 6 │ + │ … ┆ … │ + │ -8 ┆ 199992 │ + │ -6 ┆ 199994 │ + │ -4 ┆ 199996 │ + │ -2 ┆ 199998 │ + └─────────┴────────┘ + ''' + def interpolate(self) -> Self: + ''' + Interpolate intermediate values. The interpolation method is linear. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "foo": [1, None, 9, 10], + ... "bar": [6, 7, 9, None], + ... "baz": [1, None, None, 9], + ... } + ... ) + >>> lf.interpolate().collect() + shape: (4, 3) + ┌──────┬──────┬──────────┐ + │ foo ┆ bar ┆ baz │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 │ + ╞══════╪══════╪══════════╡ + │ 1.0 ┆ 6.0 ┆ 1.0 │ + │ 5.0 ┆ 7.0 ┆ 3.666667 │ + │ 9.0 ┆ 9.0 ┆ 6.333333 │ + │ 10.0 ┆ null ┆ 9.0 │ + └──────┴──────┴──────────┘ + ''' + def unnest(self, columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector) -> Self: + ''' + Decompose struct columns into separate columns for each of their fields. + + The new columns will be inserted into the DataFrame at the location of the + struct column. + + Parameters + ---------- + columns + Name of the struct column(s) that should be unnested. + *more_columns + Additional columns to unnest, specified as positional arguments. + + Examples + -------- + >>> df = pl.LazyFrame( + ... { + ... "before": ["foo", "bar"], + ... "t_a": [1, 2], + ... "t_b": ["a", "b"], + ... "t_c": [True, None], + ... "t_d": [[1, 2], [3]], + ... "after": ["baz", "womp"], + ... } + ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after") + >>> df.collect() + shape: (2, 3) + ┌────────┬─────────────────────┬───────┐ + │ before ┆ t_struct ┆ after │ + │ --- ┆ --- ┆ --- │ + │ str ┆ struct[4] ┆ str │ + ╞════════╪═════════════════════╪═══════╡ + │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │ + │ bar ┆ {2,"b",null,[3]} ┆ womp │ + └────────┴─────────────────────┴───────┘ + >>> df.unnest("t_struct").collect() + shape: (2, 6) + ┌────────┬─────┬─────┬──────┬───────────┬───────┐ + │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │ + ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡ + │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │ + │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │ + └────────┴─────┴─────┴──────┴───────────┴───────┘ + ''' + def merge_sorted(self, other: LazyFrame, key: str) -> Self: + ''' + Take two sorted DataFrames and merge them by the sorted key. + + The output of this operation will also be sorted. 
+ It is the callers responsibility that the frames are sorted + by that key otherwise the output will not make sense. + + The schemas of both LazyFrames must be equal. + + Parameters + ---------- + other + Other DataFrame that must be merged + key + Key that is sorted. + + Examples + -------- + >>> df0 = pl.LazyFrame( + ... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]} + ... ).sort("age") + >>> df0.collect() + shape: (3, 2) + ┌───────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞═══════╪═════╡ + │ bob ┆ 18 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └───────┴─────┘ + >>> df1 = pl.LazyFrame( + ... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]} + ... ).sort("age") + >>> df1.collect() + shape: (4, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + └────────┴─────┘ + >>> df0.merge_sorted(df1, key="age").collect() + shape: (7, 2) + ┌────────┬─────┐ + │ name ┆ age │ + │ --- ┆ --- │ + │ str ┆ i64 │ + ╞════════╪═════╡ + │ bob ┆ 18 │ + │ thomas ┆ 20 │ + │ anna ┆ 21 │ + │ megan ┆ 33 │ + │ steve ┆ 42 │ + │ steve ┆ 42 │ + │ elise ┆ 44 │ + └────────┴─────┘ + ''' + def set_sorted(self, column: str | Iterable[str], *more_columns: str) -> Self: + """ + Indicate that one or multiple columns are sorted. + + Parameters + ---------- + column + Columns that are sorted + more_columns + Additional columns that are sorted, specified as positional arguments. + descending + Whether the columns are sorted in descending order. + """ + def update(self, other: LazyFrame, on: str | Sequence[str] | None = ..., how: Literal['left', 'inner', 'outer'] = ...) -> Self: + ''' + Update the values in this `LazyFrame` with the non-null values in `other`. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + other + LazyFrame that will be used to update the values + on + Column names that will be joined on. If set to `None` (default), + the implicit row index of each frame is used as a join key. + how : {\'left\', \'inner\', \'outer\'} + * \'left\' will keep all rows from the left table; rows may be duplicated + if multiple rows in the right frame match the left row\'s key. + * \'inner\' keeps only those rows where the key exists in both frames. + * \'outer\' will update existing rows where the key matches while also + adding any new rows contained in the given frame. + left_on + Join column(s) of the left DataFrame. + right_on + Join column(s) of the right DataFrame. + include_nulls + If True, null values from the right DataFrame will be used to update the + left DataFrame. + + Notes + ----- + This is syntactic sugar for a left/inner join, with an optional coalesce when + `include_nulls = False`. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "A": [1, 2, 3, 4], + ... "B": [400, 500, 600, 700], + ... } + ... ) + >>> lf.collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 400 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + >>> new_lf = pl.LazyFrame( + ... { + ... "B": [-66, None, -99], + ... "C": [5, 3, 1], + ... } + ... 
) + + Update `df` values with the non-null values in `new_df`, by row index: + + >>> lf.update(new_lf).collect() + shape: (4, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + │ 4 ┆ 700 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, by row index, + but only keeping those rows that are common to both frames: + + >>> lf.update(new_lf, how="inner").collect() + shape: (3, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -66 │ + │ 2 ┆ 500 │ + │ 3 ┆ -99 │ + └─────┴─────┘ + + Update `df` values with the non-null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect() + shape: (5, 2) + ┌─────┬─────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ 600 │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴─────┘ + + Update `df` values including null values in `new_df`, using an outer join + strategy that defines explicit join columns in each frame: + + >>> lf.update( + ... new_lf, left_on="A", right_on="C", how="outer", include_nulls=True + ... ).collect() + shape: (5, 2) + ┌─────┬──────┐ + │ A ┆ B │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪══════╡ + │ 1 ┆ -99 │ + │ 2 ┆ 500 │ + │ 3 ┆ null │ + │ 4 ┆ 700 │ + │ 5 ┆ -66 │ + └─────┴──────┘ + ''' + def count(self) -> Self: + ''' + Return the number of non-null elements for each column. + + Examples + -------- + >>> lf = pl.LazyFrame( + ... {"a": [1, 2, 3, 4], "b": [1, 2, 1, None], "c": [None, None, None, None]} + ... ) + >>> lf.count().collect() + shape: (1, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ u32 ┆ u32 │ + ╞═════╪═════╪═════╡ + │ 4 ┆ 3 ┆ 0 │ + └─────┴─────┴─────┘ + ''' + def groupby(self, by: IntoExpr | Iterable[IntoExpr], *more_by: IntoExpr) -> LazyGroupBy: + """ + Start a group by operation. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by`. + + Parameters + ---------- + by + Column(s) to group by. Accepts expression input. Strings are parsed as + column names. + *more_by + Additional columns to group by, specified as positional arguments. + maintain_order + Ensure that the order of the groups is consistent with the input data. + This is slower than a default group by. + Settings this to `True` blocks the possibility + to run on the streaming engine. + """ + def groupby_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + """ + def group_by_rolling(self, index_column: IntoExpr) -> LazyGroupBy: + """ + Create rolling groups based on a time, Int32, or Int64 column. + + .. deprecated:: 0.19.9 + This method has been renamed to :func:`LazyFrame.rolling`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a rolling group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + period + length of the window - must be non-negative + offset + offset of the window. Default is -period + closed : {'right', 'left', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive). + by + Also group by this column/these columns + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + """ + def groupby_dynamic(self, index_column: IntoExpr) -> LazyGroupBy: + ''' + Group based on a time value (or index value of type Int32, Int64). + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.group_by_dynamic`. + + Parameters + ---------- + index_column + Column used to group based on the time window. + Often of type Date/Datetime. + This column must be sorted in ascending order (or, if `by` is specified, + then it must be sorted in ascending order within each group). + + In case of a dynamic group by on indices, dtype needs to be one of + {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if + performance matters use an Int64 column. + every + interval of the window + period + length of the window, if None it will equal \'every\' + offset + offset of the window, only takes effect if `start_by` is `\'window\'`. + Defaults to negative `every`. + truncate + truncate the time value to the window lower bound + include_boundaries + Add the lower and upper bound of the window to the "_lower_bound" and + "_upper_bound" columns. This will impact performance because it\'s harder to + parallelize + closed : {\'right\', \'left\', \'both\', \'none\'} + Define which sides of the temporal interval are closed (inclusive). 
+ by + Also group by this column/these columns + start_by : {\'window\', \'datapoint\', \'monday\', \'tuesday\', \'wednesday\', \'thursday\', \'friday\', \'saturday\', \'sunday\'} + The strategy to determine the start of the first window by. + + * \'window\': Start by taking the earliest timestamp, truncating it with + `every`, and then adding `offset`. + Note that weekly windows start on Monday. + * \'datapoint\': Start from the first encountered data point. + * a day of the week (only takes effect if `every` contains `\'w\'`): + + * \'monday\': Start the window on the Monday before the first data point. + * \'tuesday\': Start the window on the Tuesday before the first data point. + * ... + * \'sunday\': Start the window on the Sunday before the first data point. + check_sorted + When the `by` argument is given, polars can not check sortedness + by the metadata and has to do a full scan on the index column to + verify data is sorted. This is expensive. If you are sure the + data within the by groups is sorted, you can set this to `False`. + Doing so incorrectly will lead to incorrect output + + Returns + ------- + LazyGroupBy + Object you can call `.agg` on to aggregate by groups, the result + of which will be sorted by `index_column` (but note that if `by` columns are + passed, it will only be sorted within each `by` group). + ''' + def map(self, function: Callable[[DataFrame], DataFrame]) -> Self: + """ + Apply a custom function. + + .. deprecated:: 0.19.0 + This method has been renamed to :func:`LazyFrame.map_batches`. + + Parameters + ---------- + function + Lambda/ function to apply. + predicate_pushdown + Allow predicate pushdown optimization to pass this node. + projection_pushdown + Allow projection pushdown optimization to pass this node. + slice_pushdown + Allow slice pushdown optimization to pass this node. + no_optimizations + Turn off all optimizations past this point. + schema + Output schema of the function, if set to `None` we assume that the schema + will remain unchanged by the applied function. + validate_output_schema + It is paramount that polars' schema is correct. This flag will ensure that + the output schema of this function will be checked with the expected schema. + Setting this to `False` will not do this check, but may lead to hard to + debug bugs. + streamable + Whether the function that is given is eligible to be running with the + streaming engine. That means that the function must produce the same result + when it is executed in batches or when it is be executed on the full + dataset. + """ + def shift_and_fill(self, fill_value: Expr | int | str | float) -> Self: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + fill None values with the result of this expression. + n + Number of places to shift (may be negative). + """ + def take_every(self, n: int, offset: int = ...) -> Self: + """ + Take every nth row in the LazyFrame and return as a new LazyFrame. + + .. deprecated:: 0.19.0 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + @property + def columns(self): ... + @property + def dtypes(self): ... + @property + def schema(self): ... + @property + def width(self): ... 
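Each stub file added by this diff begins with a `#: version <polars version>` comment and is committed under a directory named for that same version (for example, the `_stubs/0.20.6/polars/series/series.pyi` file below starts with `#: version 0.20.6`). A mismatch between the header and the directory would mean a stub was generated for one polars release but stored under another. The snippet below is a minimal sketch of a consistency check for that invariant; it is not part of this PR, uses only the standard library, and assumes the `src/polugins_type_gen/_stubs/<version>/...` layout visible in the file paths of this diff, run from the `polugins_type_gen` project directory.

from pathlib import Path


def find_version_header_mismatches(stubs_root: Path) -> list[str]:
    # For every generated .pyi file, compare the version directory it lives in
    # (the first path component under stubs_root) with its '#: version ...' header line.
    mismatches = []
    for stub in stubs_root.rglob("*.pyi"):
        expected_version = stub.relative_to(stubs_root).parts[0]
        lines = stub.read_text().splitlines()
        first_line = lines[0].strip() if lines else ""
        if first_line != f"#: version {expected_version}":
            mismatches.append(
                f"{stub}: header {first_line!r} does not match directory {expected_version!r}"
            )
    return mismatches


if __name__ == "__main__":
    # Assumed layout: src/polugins_type_gen/_stubs/<version>/polars/<module>/<file>.pyi
    root = Path("src", "polugins_type_gen", "_stubs")
    for problem in find_version_header_mismatches(root):
        print(problem)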
diff --git a/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.6/polars/series/series.pyi b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.6/polars/series/series.pyi new file mode 100644 index 0000000..65d2f9e --- /dev/null +++ b/polugins_type_gen/src/polugins_type_gen/_stubs/0.20.6/polars/series/series.pyi @@ -0,0 +1,5254 @@ +#: version 0.20.6 +import np as np +import pa as pa +import pd as pd +from builtins import PySeries +from datetime import date, datetime, timedelta +from polars.datatypes.classes import Array as Array, Boolean as Boolean, Categorical as Categorical, Date as Date, Datetime as Datetime, Decimal as Decimal, Duration as Duration, Enum as Enum, Float64 as Float64, Int16 as Int16, Int32 as Int32, Int64 as Int64, Int8 as Int8, List as List, Null as Null, Object as Object, String as String, Time as Time, UInt32 as UInt32, UInt64 as UInt64, Unknown as Unknown +from polars.datatypes.convert import dtype_to_ctype as dtype_to_ctype, is_polars_dtype as is_polars_dtype, maybe_cast as maybe_cast, numpy_char_code_to_dtype as numpy_char_code_to_dtype, py_type_to_dtype as py_type_to_dtype, supported_numpy_char_code as supported_numpy_char_code +from polars.dependencies import _check_for_numpy as _check_for_numpy, _check_for_pandas as _check_for_pandas, _check_for_pyarrow as _check_for_pyarrow, dataframe_api_compat as dataframe_api_compat, hvplot as hvplot +from polars.exceptions import ModuleUpgradeRequired as ModuleUpgradeRequired, ShapeError as ShapeError +from polars.series.array import ArrayNameSpace as ArrayNameSpace +from polars.series.binary import BinaryNameSpace as BinaryNameSpace +from polars.series.categorical import CatNameSpace as CatNameSpace +from polars.series.datetime import DateTimeNameSpace as DateTimeNameSpace +from polars.series.list import ListNameSpace as ListNameSpace +from polars.series.string import StringNameSpace as StringNameSpace +from polars.series.struct import StructNameSpace as StructNameSpace +from polars.series.utils import expr_dispatch as expr_dispatch, get_ffi_func as get_ffi_func +from polars.slice import PolarsSlice as PolarsSlice +from polars.utils._construction import arrow_to_pyseries as arrow_to_pyseries, dataframe_to_pyseries as dataframe_to_pyseries, iterable_to_pyseries as iterable_to_pyseries, numpy_to_idxs as numpy_to_idxs, numpy_to_pyseries as numpy_to_pyseries, pandas_to_pyseries as pandas_to_pyseries, sequence_to_pyseries as sequence_to_pyseries, series_to_pyseries as series_to_pyseries +from polars.utils._wrap import wrap_df as wrap_df +from polars.utils.convert import _date_to_pl_date as _date_to_pl_date, _datetime_to_pl_timestamp as _datetime_to_pl_timestamp, _time_to_pl_time as _time_to_pl_time, _timedelta_to_pl_timedelta as _timedelta_to_pl_timedelta +from polars.utils.deprecation import deprecate_function as deprecate_function, deprecate_nonkeyword_arguments as deprecate_nonkeyword_arguments, deprecate_renamed_function as deprecate_renamed_function, deprecate_renamed_parameter as deprecate_renamed_parameter, issue_deprecation_warning as issue_deprecation_warning +from polars.utils.meta import get_index_type as get_index_type +from polars.utils.unstable import unstable as unstable +from polars.utils.various import _is_generator as _is_generator, no_default as no_default, parse_version as parse_version, range_to_slice as range_to_slice, scale_bytes as scale_bytes, sphinx_accessor as sphinx_accessor, warn_null_comparison as warn_null_comparison +from typing import Any, ArrayLike, Callable, ClassVar as _ClassVar, 
Collection, Generator, Iterable, Mapping, NoReturn, Sequence + +TYPE_CHECKING: bool +_HVPLOT_AVAILABLE: bool +_PYARROW_AVAILABLE: bool + +class Series: + _s: _ClassVar[None] = ... + _accessors: _ClassVar[set] = ... + def __init__(self, name: str | ArrayLike | None = ..., values: ArrayLike | None = ..., dtype: PolarsDataType | None = ...) -> None: ... + @classmethod + def _from_pyseries(cls, pyseries: PySeries) -> Self: ... + @classmethod + def _from_arrow(cls, name: str, values: pa.Array) -> Self: + """Construct a Series from an Arrow Array.""" + @classmethod + def _from_pandas(cls, name: str, values: pd.Series[Any] | pd.DatetimeIndex) -> Self: + """Construct a Series from a pandas Series or DatetimeIndex.""" + def _get_buffer_info(self) -> BufferInfo: + """ + Return pointer, offset, and length information about the underlying buffer. + + Returns + ------- + tuple of ints + Tuple of the form (pointer, offset, length) + + Raises + ------ + TypeError + If the `Series` data type is not physical. + ComputeError + If the `Series` contains multiple chunks. + + Notes + ----- + This method is mainly intended for use with the dataframe interchange protocol. + """ + def _get_buffers(self) -> SeriesBuffers: + ''' + Return the underlying values, validity, and offsets buffers as Series. + + The values buffer always exists. + The validity buffer may not exist if the column contains no null values. + The offsets buffer only exists for Series of data type `String` and `List`. + + Returns + ------- + dict + Dictionary with `"values"`, `"validity"`, and `"offsets"` keys mapping + to the corresponding buffer or `None` if the buffer doesn\'t exist. + + Warnings + -------- + The underlying buffers for `String` Series cannot be represented in this + format. Instead, the buffers are converted to a values and offsets buffer. + + Notes + ----- + This method is mainly intended for use with the dataframe interchange protocol. + ''' + def _from_buffer(self, dtype: PolarsDataType, buffer_info: BufferInfo, owner: Any) -> Self: + """ + Construct a Series from information about its underlying buffer. + + Parameters + ---------- + dtype + The data type of the buffer. + Must be a physical type (integer, float, or boolean). + buffer_info + Tuple containing buffer information in the form `(pointer, offset, length)`. + owner + The object owning the buffer. + + Returns + ------- + Series + + Raises + ------ + TypeError + When the given `dtype` is not supported. + + Notes + ----- + This method is mainly intended for use with the dataframe interchange protocol. + """ + def _from_buffers(self, dtype: PolarsDataType, data: Series | Sequence[Series], validity: Series | None = ...) -> Self: + """ + Construct a Series from information about its underlying buffers. + + Parameters + ---------- + dtype + The data type of the resulting Series. + data + Buffers describing the data. For most data types, this is a single Series of + the physical data type of `dtype`. Some data types require multiple buffers: + + - `String`: A data buffer of type `UInt8` and an offsets buffer + of type `Int64`. Note that this does not match how the data + is represented internally and data copy is required to construct + the Series. + validity + Validity buffer. If specified, must be a Series of data type `Boolean`. + + Returns + ------- + Series + + Raises + ------ + TypeError + When the given `dtype` is not supported or the other inputs do not match + the requirements for constructing a Series of the given `dtype`. 
+ + Warnings + -------- + Constructing a `String` Series requires specifying a values and offsets buffer, + which does not match the actual underlying buffers. The values and offsets + buffer are converted into the actual buffers, which copies data. + + Notes + ----- + This method is mainly intended for use with the dataframe interchange protocol. + """ + def __bool__(self) -> NoReturn: ... + def __len__(self) -> int: ... + def __and__(self, other: Series) -> Self: ... + def __rand__(self, other: Series) -> Series: ... + def __or__(self, other: Series) -> Self: ... + def __ror__(self, other: Series) -> Series: ... + def __xor__(self, other: Series) -> Self: ... + def __rxor__(self, other: Series) -> Series: ... + def _comp(self, other: Any, op: ComparisonOperator) -> Series: ... + def __eq__(self, other: Any) -> Series | Expr: ... + def __ne__(self, other: Any) -> Series | Expr: ... + def __gt__(self, other: Any) -> Series | Expr: ... + def __lt__(self, other: Any) -> Series | Expr: ... + def __ge__(self, other: Any) -> Series | Expr: ... + def __le__(self, other: Any) -> Series | Expr: ... + def le(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series <= other`.""" + def lt(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series < other`.""" + def eq(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series == other`.""" + def eq_missing(self, other: Any) -> Series | Expr: + ''' + Method equivalent of equality operator `series == other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + ne_missing + eq + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.eq(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + null + ] + >>> s1.eq_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + ''' + def ne(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series != other`.""" + def ne_missing(self, other: Any) -> Series | Expr: + ''' + Method equivalent of equality operator `series != other` where `None == None`. + + This differs from the standard `ne` where null values are propagated. + + Parameters + ---------- + other + A literal or expression value to compare with. + + See Also + -------- + eq_missing + ne + + Examples + -------- + >>> s1 = pl.Series("a", [333, 200, None]) + >>> s2 = pl.Series("a", [100, 200, None]) + >>> s1.ne(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + null + ] + >>> s1.ne_missing(s2) + shape: (3,) + Series: \'a\' [bool] + [ + true + false + false + ] + ''' + def ge(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series >= other`.""" + def gt(self, other: Any) -> Series | Expr: + """Method equivalent of operator expression `series > other`.""" + def _arithmetic(self, other: Any, op_s: str, op_ffi: str) -> Self: ... + def __add__(self, other: Any) -> Self | DataFrame | Expr: ... + def __sub__(self, other: Any) -> Self | Expr: ... + def __truediv__(self, other: Any) -> Series | Expr: ... + def __floordiv__(self, other: Any) -> Series | Expr: ... + def __invert__(self) -> Series: ... + def __mul__(self, other: Any) -> Series | DataFrame | Expr: ... + def __mod__(self, other: Any) -> Series | Expr: ... 
+ def __rmod__(self, other: Any) -> Series: ... + def __radd__(self, other: Any) -> Series: ... + def __rsub__(self, other: Any) -> Series: ... + def __rtruediv__(self, other: Any) -> Series: ... + def __rfloordiv__(self, other: Any) -> Series: ... + def __rmul__(self, other: Any) -> Series: ... + def __pow__(self, exponent: int | float | Series) -> Series: ... + def __rpow__(self, other: Any) -> Series: ... + def __matmul__(self, other: Any) -> float | Series | None: ... + def __rmatmul__(self, other: Any) -> float | Series | None: ... + def __neg__(self) -> Series: ... + def __pos__(self) -> Series: ... + def __abs__(self) -> Series: ... + def __copy__(self) -> Self: ... + def __deepcopy__(self, memo: None = ...) -> Self: ... + def __contains__(self, item: Any) -> bool: ... + def __iter__(self) -> Generator[Any, None, None]: ... + def _pos_idxs(self, size: int) -> Series: ... + def _take_with_series(self, s: Series) -> Series: ... + def __getitem__(self, item: int | Series | range | slice | np.ndarray[Any, Any] | list[int]) -> Any: ... + def __setitem__(self, key: int | Series | np.ndarray[Any, Any] | Sequence[object] | tuple[object], value: Any) -> None: ... + def __array__(self, dtype: Any = ...) -> np.ndarray[Any, Any]: + """ + Numpy __array__ interface protocol. + + Ensures that `np.asarray(pl.Series(..))` works as expected, see + https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method. + """ + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) -> Series: + """Numpy universal functions.""" + def __column_consortium_standard__(self) -> Any: + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of polars. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + def _repr_html_(self) -> str: + """Format output data in HTML for display in Jupyter Notebooks.""" + def item(self, index: int | None = ...) -> Any: + ''' + Return the Series as a scalar, or return the element at the given index. + + If no index is provided, this is equivalent to `s[0]`, with a check + that the shape is (1,). With an index, this is equivalent to `s[index]`. + + Examples + -------- + >>> s1 = pl.Series("a", [1]) + >>> s1.item() + 1 + >>> s2 = pl.Series("a", [9, 8, 7]) + >>> s2.cum_sum().item(-1) + 24 + ''' + def estimated_size(self, unit: SizeUnit = ...) -> int | float: + ''' + Return an estimation of the total (heap) allocated size of the Series. + + Estimated size is given in the specified unit (bytes by default). + + This estimation is the sum of the size of its buffers, validity, including + nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the + size of 2 arrays is not the sum of the sizes computed from this function. In + particular, [`StructArray`]\'s size is an upper bound. + + When an array is sliced, its allocated size remains constant because the buffer + unchanged. However, this function will yield a smaller number. This is because + this function returns the visible size of the buffer, not its total capacity. + + FFI buffers are included in this estimation. + + Parameters + ---------- + unit : {\'b\', \'kb\', \'mb\', \'gb\', \'tb\'} + Scale the returned size to the given unit. 
+ + Examples + -------- + >>> s = pl.Series("values", list(range(1_000_000)), dtype=pl.UInt32) + >>> s.estimated_size() + 4000000 + >>> s.estimated_size("mb") + 3.814697265625 + ''' + def sqrt(self) -> Series: + """ + Compute the square root of the elements. + + Syntactic sugar for + + >>> pl.Series([1, 2]) ** 0.5 + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.414214 + ] + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.sqrt() + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.414214 + 1.732051 + ] + """ + def cbrt(self) -> Series: + """ + Compute the cube root of the elements. + + Optimization for + + >>> pl.Series([1, 2]) ** (1.0 / 3) + shape: (2,) + Series: '' [f64] + [ + 1.0 + 1.259921 + ] + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.cbrt() + shape: (3,) + Series: '' [f64] + [ + 1.0 + 1.259921 + 1.44225 + ] + """ + def any(self) -> bool | None: + """ + Return whether any of the values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, False]).any() + True + >>> pl.Series([False, False]).any() + False + >>> pl.Series([None, False]).any() + False + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, False]).any(ignore_nulls=False) # Returns None + """ + def all(self) -> bool | None: + """ + Return whether all values in the column are `True`. + + Only works on columns of data type :class:`Boolean`. + + Parameters + ---------- + ignore_nulls + Ignore null values (default). + + If set to `False`, `Kleene logic`_ is used to deal with nulls: + if the column contains any null values and no `True` values, + the output is `None`. + + .. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic + + Returns + ------- + bool or None + + Examples + -------- + >>> pl.Series([True, True]).all() + True + >>> pl.Series([False, True]).all() + False + >>> pl.Series([None, True]).all() + True + + Enable Kleene logic by setting `ignore_nulls=False`. + + >>> pl.Series([None, True]).all(ignore_nulls=False) # Returns None + """ + def log(self, base: float = ...) -> Series: + """ + Compute the logarithm to a given base. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.log() + shape: (3,) + Series: '' [f64] + [ + 0.0 + 0.693147 + 1.098612 + ] + """ + def log1p(self) -> Series: + """ + Compute the natural logarithm of the input array plus one, element-wise. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.log1p() + shape: (3,) + Series: '' [f64] + [ + 0.693147 + 1.098612 + 1.386294 + ] + """ + def log10(self) -> Series: + """ + Compute the base 10 logarithm of the input array, element-wise. + + Examples + -------- + >>> s = pl.Series([10, 100, 1000]) + >>> s.log10() + shape: (3,) + Series: '' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + """ + def exp(self) -> Series: + """ + Compute the exponential, element-wise. + + Examples + -------- + >>> s = pl.Series([1, 2, 3]) + >>> s.exp() + shape: (3,) + Series: '' [f64] + [ + 2.718282 + 7.389056 + 20.085537 + ] + """ + def drop_nulls(self) -> Series: + ''' + Drop all null values. + + The original order of the remaining elements is preserved. 
+ + See Also + -------- + drop_nans + + Notes + ----- + A null value is not the same as a NaN value. + To drop NaN values, use :func:`drop_nans`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nulls() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + 3.0 + NaN + ] + ''' + def drop_nans(self) -> Series: + ''' + Drop all floating point NaN values. + + The original order of the remaining elements is preserved. + + See Also + -------- + drop_nulls + + Notes + ----- + A NaN value is not the same as a null value. + To drop null values, use :func:`drop_nulls`. + + Examples + -------- + >>> s = pl.Series([1.0, None, 3.0, float("nan")]) + >>> s.drop_nans() + shape: (3,) + Series: \'\' [f64] + [ + 1.0 + null + 3.0 + ] + ''' + def to_frame(self, name: str | None = ...) -> DataFrame: + ''' + Cast this Series to a DataFrame. + + Parameters + ---------- + name + optionally name/rename the Series column in the new DataFrame. + + Examples + -------- + >>> s = pl.Series("a", [123, 456]) + >>> df = s.to_frame() + >>> df + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + + >>> df = s.to_frame("xyz") + >>> df + shape: (2, 1) + ┌─────┐ + │ xyz │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 123 │ + │ 456 │ + └─────┘ + ''' + def describe(self, percentiles: Sequence[float] | float | None = ..., interpolation: RollingInterpolationMethod = ...) -> DataFrame: + ''' + Quick summary statistics of a Series. + + Series with mixed datatypes will return summary statistics for the datatype of + the first value. + + Parameters + ---------- + percentiles + One or more percentiles to include in the summary statistics (if the + Series has a numeric dtype). All values must be in the range `[0, 1]`. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method used when calculating percentiles. + + Notes + ----- + The median is included by default as the 50% percentile. + + Returns + ------- + DataFrame + Mapping with summary statistics of a Series. + + Examples + -------- + >>> s = pl.Series([1, 2, 3, 4, 5]) + >>> s.describe() + shape: (9, 2) + ┌────────────┬──────────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ f64 │ + ╞════════════╪══════════╡ + │ count ┆ 5.0 │ + │ null_count ┆ 0.0 │ + │ mean ┆ 3.0 │ + │ std ┆ 1.581139 │ + │ min ┆ 1.0 │ + │ 25% ┆ 2.0 │ + │ 50% ┆ 3.0 │ + │ 75% ┆ 4.0 │ + │ max ┆ 5.0 │ + └────────────┴──────────┘ + + Non-numeric data types may not have all statistics available. + + >>> s = pl.Series(["aa", "aa", None, "bb", "cc"]) + >>> s.describe() + shape: (4, 2) + ┌────────────┬───────┐ + │ statistic ┆ value │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞════════════╪═══════╡ + │ count ┆ 4 │ + │ null_count ┆ 1 │ + │ min ┆ aa │ + │ max ┆ cc │ + └────────────┴───────┘ + ''' + def sum(self) -> int | float: + ''' + Reduce this Series to the sum value. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.sum() + 6 + ''' + def mean(self) -> PythonLiteral | None: + ''' + Reduce this Series to the mean value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.mean() + 2.0 + ''' + def product(self) -> int | float: + ''' + Reduce this Series to the product value. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.product() + 6 + ''' + def pow(self, exponent: int | float | Series) -> Series: + ''' + Raise to the power of the given exponent. 
+ + Parameters + ---------- + exponent + The exponent. Accepts Series input. + + Examples + -------- + >>> s = pl.Series("foo", [1, 2, 3, 4]) + >>> s.pow(3) + shape: (4,) + Series: \'foo\' [f64] + [ + 1.0 + 8.0 + 27.0 + 64.0 + ] + ''' + def min(self) -> PythonLiteral | None: + ''' + Get the minimal value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.min() + 1 + ''' + def max(self) -> PythonLiteral | None: + ''' + Get the maximum value in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.max() + 3 + ''' + def nan_max(self) -> int | float | date | datetime | timedelta | str: + ''' + Get maximum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4]) + >>> s.nan_max() + 4 + + >>> s = pl.Series("a", [1, float("nan"), 4]) + >>> s.nan_max() + nan + ''' + def nan_min(self) -> int | float | date | datetime | timedelta | str: + ''' + Get minimum value, but propagate/poison encountered NaN values. + + This differs from numpy\'s `nanmax` as numpy defaults to propagating NaN values, + whereas polars defaults to ignoring them. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4]) + >>> s.nan_min() + 1 + + >>> s = pl.Series("a", [1, float("nan"), 4]) + >>> s.nan_min() + nan + ''' + def std(self, ddof: int = ...) -> float | None: + ''' + Get the standard deviation of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.std() + 1.0 + ''' + def var(self, ddof: int = ...) -> float | None: + ''' + Get variance of this Series. + + Parameters + ---------- + ddof + “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof, + where N represents the number of elements. + By default ddof is 1. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.var() + 1.0 + ''' + def median(self) -> PythonLiteral | None: + ''' + Get the median of this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.median() + 2.0 + ''' + def quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ...) -> float | None: + ''' + Get the quantile value of this Series. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.quantile(0.5) + 2.0 + ''' + def to_dummies(self, separator: str = ...) -> DataFrame: + ''' + Get dummy/indicator variables. + + Parameters + ---------- + separator + Separator/delimiter used when generating column names. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_dummies() + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a_1 ┆ a_2 ┆ a_3 │ + │ --- ┆ --- ┆ --- │ + │ u8 ┆ u8 ┆ u8 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 0 ┆ 0 │ + │ 0 ┆ 1 ┆ 0 │ + │ 0 ┆ 0 ┆ 1 │ + └─────┴─────┴─────┘ + ''' + def cut(self, breaks: Sequence[float]) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. 
+ + Parameters + ---------- + breaks + List of unique cut points. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + left_closed + Set the intervals to be left-closed instead of right-closed. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + qcut + + Examples + -------- + Divide the column into three categories. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.cut([-1, 1], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.cut([-1, 1], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + ''' + def qcut(self, quantiles: Sequence[float] | int) -> Series | DataFrame: + ''' + Bin continuous values into discrete categories based on their quantiles. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + quantiles + Either a list of quantile probabilities between 0 and 1 or a positive + integer determining the number of bins with uniform probability. + labels + Names of the categories. The number of labels must be equal to the number + of cut points plus one. + left_closed + Set the intervals to be left-closed instead of right-closed. + allow_duplicates + If set to `True`, duplicates in the resulting quantiles are dropped, + rather than raising a `DuplicateError`. This can happen even with unique + probabilities, depending on the data. + include_breaks + Include a column with the right endpoint of the bin each observation falls + in. This will change the data type of the output from a + :class:`Categorical` to a :class:`Struct`. + break_point_label + Name of the breakpoint column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. 
+ category_label + Name of the category column. Only used if `include_breaks` is set to + `True`. + + .. deprecated:: 0.19.0 + This parameter will be removed. Use `Series.struct.rename_fields` to + rename the field instead. + as_series + If set to `False`, return a DataFrame containing the original values, + the breakpoints, and the categories. + + .. deprecated:: 0.19.0 + This parameter will be removed. The same behavior can be achieved by + setting `include_breaks=True`, unnesting the resulting struct Series, + and adding the result to the original Series. + + Returns + ------- + Series + Series of data type :class:`Categorical` if `include_breaks` is set to + `False` (default), otherwise a Series of data type :class:`Struct`. + + See Also + -------- + cut + + Examples + -------- + Divide a column into three categories according to pre-defined quantile + probabilities. + + >>> s = pl.Series("foo", [-2, -1, 0, 1, 2]) + >>> s.qcut([0.25, 0.75], labels=["a", "b", "c"]) + shape: (5,) + Series: \'foo\' [cat] + [ + "a" + "a" + "b" + "b" + "c" + ] + + Divide a column into two categories using uniform quantile probabilities. + + >>> s.qcut(2, labels=["low", "high"], left_closed=True) + shape: (5,) + Series: \'foo\' [cat] + [ + "low" + "low" + "high" + "high" + "high" + ] + + Create a DataFrame with the breakpoint and category for each value. + + >>> cut = s.qcut([0.25, 0.75], include_breaks=True).alias("cut") + >>> s.to_frame().with_columns(cut).unnest("cut") + shape: (5, 3) + ┌─────┬─────────────┬────────────┐ + │ foo ┆ break_point ┆ category │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ cat │ + ╞═════╪═════════════╪════════════╡ + │ -2 ┆ -1.0 ┆ (-inf, -1] │ + │ -1 ┆ -1.0 ┆ (-inf, -1] │ + │ 0 ┆ 1.0 ┆ (-1, 1] │ + │ 1 ┆ 1.0 ┆ (-1, 1] │ + │ 2 ┆ inf ┆ (1, inf] │ + └─────┴─────────────┴────────────┘ + ''' + def rle(self) -> Series: + ''' + Get the lengths and values of runs of identical values. + + Returns + ------- + Series + Series of data type :class:`Struct` with Fields "lengths" and "values". + + See Also + -------- + rle_id + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle().struct.unnest() + shape: (6, 2) + ┌─────────┬────────┐ + │ lengths ┆ values │ + │ --- ┆ --- │ + │ i32 ┆ i64 │ + ╞═════════╪════════╡ + │ 2 ┆ 1 │ + │ 1 ┆ 2 │ + │ 1 ┆ 1 │ + │ 1 ┆ null │ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────────┴────────┘ + ''' + def rle_id(self) -> Series: + ''' + Get a distinct integer ID for each run of identical values. + + The ID increases by one each time the value of a column (which can be a + :class:`Struct`) changes. + + This is especially useful when you want to define a new group for every time a + column\'s value changes, rather than for every distinct value of that column. + + Returns + ------- + Series + + See Also + -------- + rle + + Examples + -------- + >>> s = pl.Series("s", [1, 1, 2, 1, None, 1, 3, 3]) + >>> s.rle_id() + shape: (8,) + Series: \'s\' [u32] + [ + 0 + 0 + 1 + 2 + 3 + 4 + 5 + 5 + ] + ''' + def hist(self, bins: list[float] | None = ...) -> DataFrame: + ''' + Bin values into buckets and count their occurrences. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + bins + Discretizations to make. + If None given, we determine the boundaries based on the data. + bin_count + If no bins provided, this will be used to determine + the distance of the bins + include_breakpoint + Include a column that indicates the upper breakpoint. 
+ include_category + Include a column that shows the intervals as categories. + + Returns + ------- + DataFrame + + Examples + -------- + >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) + >>> a.hist(bin_count=4) + shape: (5, 3) + ┌─────────────┬─────────────┬───────┐ + │ break_point ┆ category ┆ count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞═════════════╪═════════════╪═══════╡ + │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ + │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ + │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ + │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ + │ inf ┆ (6.75, inf] ┆ 2 │ + └─────────────┴─────────────┴───────┘ + ''' + def value_counts(self) -> DataFrame: + ''' + Count the occurrences of unique values. + + Parameters + ---------- + sort + Sort the output by count in descending order. + If set to `False` (default), the order of the output is random. + parallel + Execute the computation in parallel. + + .. note:: + This option should likely not be enabled in a group by context, + as the computation is already parallelized per group. + + Returns + ------- + DataFrame + Mapping of unique values to their count. + + Examples + -------- + >>> s = pl.Series("color", ["red", "blue", "red", "green", "blue", "blue"]) + >>> s.value_counts() # doctest: +IGNORE_RESULT + shape: (3, 2) + ┌───────┬───────┐ + │ color ┆ count │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════╡ + │ red ┆ 2 │ + │ green ┆ 1 │ + │ blue ┆ 3 │ + └───────┴───────┘ + + Sort the output by count. + + >>> s.value_counts(sort=True) + shape: (3, 2) + ┌───────┬───────┐ + │ color ┆ count │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════╡ + │ blue ┆ 3 │ + │ red ┆ 2 │ + │ green ┆ 1 │ + └───────┴───────┘ + ''' + def unique_counts(self) -> Series: + ''' + Return a count of the unique values in the order of appearance. + + Examples + -------- + >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"]) + >>> s.unique_counts() + shape: (3,) + Series: \'id\' [u32] + [ + 1 + 2 + 3 + ] + ''' + def entropy(self, base: float = ...) -> float | None: + """ + Computes the entropy. + + Uses the formula `-sum(pk * log(pk)` where `pk` are discrete probabilities. + + Parameters + ---------- + base + Given base, defaults to `e` + normalize + Normalize pk if it doesn't sum to 1. + + Examples + -------- + >>> a = pl.Series([0.99, 0.005, 0.005]) + >>> a.entropy(normalize=True) + 0.06293300616044681 + >>> b = pl.Series([0.65, 0.10, 0.25]) + >>> b.entropy(normalize=True) + 0.8568409950394724 + """ + def cumulative_eval(self, expr: Expr, min_periods: int = ...) -> Series: + ''' + Run an expression over a sliding window that increases `1` slot every iteration. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + expr + Expression to evaluate + min_periods + Number of valid values there should be in the window before the expression + is evaluated. valid values = `length - null_count` + parallel + Run in parallel. Don\'t do this in a group by or another operation that + already has much parallelization. + + Warnings + -------- + This can be really slow as it can have `O(n^2)` complexity. Don\'t use this + for operations that visit all elements. + + Examples + -------- + >>> s = pl.Series("values", [1, 2, 3, 4, 5]) + >>> s.cumulative_eval(pl.element().first() - pl.element().last() ** 2) + shape: (5,) + Series: \'values\' [f64] + [ + 0.0 + -3.0 + -8.0 + -15.0 + -24.0 + ] + ''' + def alias(self, name: str) -> Series: + ''' + Rename the series. + + Parameters + ---------- + name + The new name. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.alias("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def rename(self, name: str) -> Series: + ''' + Rename this Series. + + Alias for :func:`Series.alias`. + + Parameters + ---------- + name + New name. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.rename("b") + shape: (3,) + Series: \'b\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def chunk_lengths(self) -> list[int]: + ''' + Get the length of each individual chunk. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).chunk_lengths() + [6] + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).chunk_lengths() + [3, 3] + ''' + def n_chunks(self) -> int: + ''' + Get the number of chunks that this Series contains. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.n_chunks() + 1 + >>> s2 = pl.Series("a", [4, 5, 6]) + + Concatenate Series with rechunk = True + + >>> pl.concat([s, s2]).n_chunks() + 1 + + Concatenate Series with rechunk = False + + >>> pl.concat([s, s2], rechunk=False).n_chunks() + 2 + ''' + def cum_max(self) -> Series: + ''' + Get an array with the cumulative max computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [3, 5, 1]) + >>> s.cum_max() + shape: (3,) + Series: \'s\' [i64] + [ + 3 + 5 + 5 + ] + ''' + def cum_min(self) -> Series: + ''' + Get an array with the cumulative min computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Examples + -------- + >>> s = pl.Series("s", [1, 2, 3]) + >>> s.cum_min() + shape: (3,) + Series: \'s\' [i64] + [ + 1 + 1 + 1 + ] + ''' + def cum_prod(self) -> Series: + ''' + Get an array with the cumulative product computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_prod() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 6 + ] + ''' + def cum_sum(self) -> Series: + ''' + Get an array with the cumulative sum computed at every element. + + Parameters + ---------- + reverse + reverse the operation. + + Notes + ----- + Dtypes in {Int8, UInt8, Int16, UInt16} are cast to + Int64 before summing to prevent overflow issues. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.cum_sum() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 3 + 6 + ] + ''' + def cum_count(self) -> Self: + ''' + Return the cumulative count of the non-null values in the column. + + Parameters + ---------- + reverse + Reverse the operation. + + Examples + -------- + >>> s = pl.Series(["x", "k", None, "d"]) + >>> s.cum_count() + shape: (4,) + Series: \'\' [u32] + [ + 1 + 2 + 2 + 3 + ] + ''' + def slice(self, offset: int, length: int | None = ...) -> Series: + ''' + Get a slice of this Series. + + Parameters + ---------- + offset + Start index. Negative indexing is supported. + length + Length of the slice. If set to `None`, all rows starting at the offset + will be selected. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.slice(1, 2) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 3 + ] + ''' + def append(self, other: Series) -> Self: + ''' + Append a Series to this one. 
+ + The resulting series will consist of multiple chunks. + + Parameters + ---------- + other + Series to append. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + extend + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.append(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of multiple chunks. + + >>> a.n_chunks() + 2 + ''' + def extend(self, other: Series) -> Self: + ''' + Extend the memory backed by this Series with the values from another. + + Different from `append`, which adds the chunks from `other` to the chunks of + this series, `extend` appends the data from `other` to the underlying memory + locations and thus may cause a reallocation (which is expensive). + + If this does `not` cause a reallocation, the resulting data structure will not + have any extra chunks and thus will yield faster queries. + + Prefer `extend` over `append` when you want to do a query after a single + append. For instance, during online operations where you add `n` rows + and rerun a query. + + Prefer `append` over `extend` when you want to append many times + before doing a query. For instance, when you read in multiple files and want + to store them in a single `Series`. In the latter case, finish the sequence + of `append` operations with a `rechunk`. + + Parameters + ---------- + other + Series to extend the series with. + + Warnings + -------- + This method modifies the series in-place. The series is returned for + convenience only. + + See Also + -------- + append + + Examples + -------- + >>> a = pl.Series("a", [1, 2, 3]) + >>> b = pl.Series("b", [4, 5]) + >>> a.extend(b) + shape: (5,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + 5 + ] + + The resulting series will consist of a single chunk. + + >>> a.n_chunks() + 1 + ''' + def filter(self, predicate: Series | list[bool]) -> Self: + ''' + Filter elements by a boolean mask. + + The original order of the remaining elements is preserved. + + Parameters + ---------- + predicate + Boolean mask. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> mask = pl.Series("", [True, False, True]) + >>> s.filter(mask) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + ''' + def head(self, n: int = ...) -> Series: + ''' + Get the first `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + tail, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.head(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + + Pass a negative value to get all rows `except` the last `abs(n)`. + + >>> s.head(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 2 + ] + ''' + def tail(self, n: int = ...) -> Series: + ''' + Get the last `n` elements. + + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the first `abs(n)`. + + See Also + -------- + head, slice + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.tail(3) + shape: (3,) + Series: \'a\' [i64] + [ + 3 + 4 + 5 + ] + + Pass a negative value to get all rows `except` the first `abs(n)`. + + >>> s.tail(-3) + shape: (2,) + Series: \'a\' [i64] + [ + 4 + 5 + ] + ''' + def limit(self, n: int = ...) -> Series: + """ + Get the first `n` elements. + + Alias for :func:`Series.head`. 
+ + Parameters + ---------- + n + Number of elements to return. If a negative value is passed, return all + elements except the last `abs(n)`. + + See Also + -------- + head + """ + def gather_every(self, n: int, offset: int = ...) -> Series: + ''' + Take every nth value in the Series and return as new Series. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Start the row index at this offset. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather_every(2) + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 3 + ] + >>> s.gather_every(2, offset=1) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + ''' + def sort(self) -> Self: + ''' + Sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + in_place + Sort in-place. + + Examples + -------- + >>> s = pl.Series("a", [1, 3, 4, 2]) + >>> s.sort() + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 4 + ] + >>> s.sort(descending=True) + shape: (4,) + Series: \'a\' [i64] + [ + 4 + 3 + 2 + 1 + ] + ''' + def top_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` largest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + bottom_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.top_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 5 + 4 + 3 + ] + ''' + def bottom_k(self, k: int | IntoExprColumn = ...) -> Series: + ''' + Return the `k` smallest elements. + + This has time complexity: + + .. math:: O(n + k \\\\log{}n - \\frac{k}{2}) + + Parameters + ---------- + k + Number of elements to return. + + See Also + -------- + top_k + + Examples + -------- + >>> s = pl.Series("a", [2, 5, 1, 4, 3]) + >>> s.bottom_k(3) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def arg_sort(self) -> Series: + ''' + Get the index values that would sort this Series. + + Parameters + ---------- + descending + Sort in descending order. + nulls_last + Place null values last instead of first. + + Examples + -------- + >>> s = pl.Series("a", [5, 3, 4, 1, 2]) + >>> s.arg_sort() + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 0 + ] + ''' + def arg_unique(self) -> Series: + ''' + Get unique index as Series. + + Returns + ------- + Series + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.arg_unique() + shape: (3,) + Series: \'a\' [u32] + [ + 0 + 1 + 3 + ] + ''' + def arg_min(self) -> int | None: + ''' + Get the index of the minimal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_min() + 2 + ''' + def arg_max(self) -> int | None: + ''' + Get the index of the maximal value. + + Returns + ------- + int + + Examples + -------- + >>> s = pl.Series("a", [3, 2, 1]) + >>> s.arg_max() + 0 + ''' + def search_sorted(self, element: int | float | Series | np.ndarray[Any, Any] | list[int] | list[float], side: SearchSortedSide = ...) -> int | Series: + """ + Find indices where elements should be inserted to maintain order. + + .. math:: a[i-1] < v <= a[i] + + Parameters + ---------- + element + Expression or scalar value. + side : {'any', 'left', 'right'} + If 'any', the index of the first suitable location found is given. + If 'left', the index of the leftmost suitable location found is given. + If 'right', return the rightmost suitable location found is given. 
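+
+        Examples
+        --------
+        A minimal illustration, assuming the usual `search_sorted` semantics
+        with the default `side` (the return value shown is indicative):
+
+        >>> s = pl.Series("a", [1, 2, 4, 5])
+        >>> s.search_sorted(3)  # 3 would be inserted between 2 and 4
+        2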
+ """ + def unique(self) -> Series: + ''' + Get unique elements in series. + + Parameters + ---------- + maintain_order + Maintain order of data. This requires more work. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.unique().sort() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def gather(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + ''' + Take values by index. + + Parameters + ---------- + indices + Index location used for selection. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4]) + >>> s.gather([1, 3]) + shape: (2,) + Series: \'a\' [i64] + [ + 2 + 4 + ] + ''' + def null_count(self) -> int: + """ + Count the null values in this Series. + + Examples + -------- + >>> s = pl.Series([1, None, None]) + >>> s.null_count() + 2 + """ + def has_validity(self) -> bool: + """ + Return True if the Series has a validity bitmask. + + If there is no mask, it means that there are no `null` values. + + Notes + ----- + While the *absence* of a validity bitmask guarantees that a Series does not + have `null` values, the converse is not true, eg: the *presence* of a + bitmask does not mean that there are null values, as every value of the + bitmask could be `false`. + + To confirm that a column has `null` values use :func:`null_count`. + """ + def is_empty(self) -> bool: + ''' + Check if the Series is empty. + + Examples + -------- + >>> s = pl.Series("a", [], dtype=pl.Float32) + >>> s.is_empty() + True + ''' + def is_sorted(self) -> bool: + """ + Check if the Series is sorted. + + Parameters + ---------- + descending + Check if the Series is sorted in descending order + + Examples + -------- + >>> s = pl.Series([1, 3, 2]) + >>> s.is_sorted() + False + + >>> s = pl.Series([3, 2, 1]) + >>> s.is_sorted(descending=True) + True + """ + def not_(self) -> Series: + ''' + Negate a boolean Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [True, False, False]) + >>> s.not_() + shape: (3,) + Series: \'a\' [bool] + [ + false + true + true + ] + ''' + def is_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_null() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + ''' + def is_not_null(self) -> Series: + ''' + Returns a boolean Series indicating which values are not null. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, None]) + >>> s.is_not_null() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + ''' + def is_finite(self) -> Series: + ''' + Returns a boolean Series indicating which values are finite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_finite() + shape: (3,) + Series: \'a\' [bool] + [ + true + true + false + ] + ''' + def is_infinite(self) -> Series: + ''' + Returns a boolean Series indicating which values are infinite. + + Returns + ------- + Series + Series of data type :class:`Boolean`. 
+ + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, np.inf]) + >>> s.is_infinite() + shape: (3,) + Series: \'a\' [bool] + [ + false + false + true + ] + ''' + def is_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_nan() + shape: (4,) + Series: \'a\' [bool] + [ + false + false + false + true + ] + ''' + def is_not_nan(self) -> Series: + ''' + Returns a boolean Series indicating which values are not NaN. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> import numpy as np + >>> s = pl.Series("a", [1.0, 2.0, 3.0, np.nan]) + >>> s.is_not_nan() + shape: (4,) + Series: \'a\' [bool] + [ + true + true + true + false + ] + ''' + def is_in(self, other: Series | Collection[Any]) -> Series: + ''' + Check if elements of this Series are in the other Series. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [2, 4]) + >>> s2.is_in(s) + shape: (2,) + Series: \'b\' [bool] + [ + true + false + ] + + >>> # check if some values are a member of sublists + >>> sets = pl.Series("sets", [[1, 2, 3], [1, 2], [9, 10]]) + >>> optional_members = pl.Series("optional_members", [1, 2, 3]) + >>> print(sets) + shape: (3,) + Series: \'sets\' [list[i64]] + [ + [1, 2, 3] + [1, 2] + [9, 10] + ] + >>> print(optional_members) + shape: (3,) + Series: \'optional_members\' [i64] + [ + 1 + 2 + 3 + ] + >>> optional_members.is_in(sets) + shape: (3,) + Series: \'optional_members\' [bool] + [ + true + true + false + ] + ''' + def arg_true(self) -> Series: + ''' + Get index values where Boolean Series evaluate True. + + Returns + ------- + Series + Series of data type :class:`UInt32`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> (s == 2).arg_true() + shape: (1,) + Series: \'a\' [u32] + [ + 1 + ] + ''' + def is_unique(self) -> Series: + ''' + Get mask of all unique values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_unique() + shape: (4,) + Series: \'a\' [bool] + [ + true + false + false + true + ] + ''' + def is_first_distinct(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_first_distinct() + shape: (5,) + Series: '' [bool] + [ + true + false + true + true + false + ] + """ + def is_last_distinct(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series([1, 1, 2, 3, 2]) + >>> s.is_last_distinct() + shape: (5,) + Series: '' [bool] + [ + false + true + false + true + true + ] + """ + def is_duplicated(self) -> Series: + ''' + Get mask of all duplicated values. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.is_duplicated() + shape: (4,) + Series: \'a\' [bool] + [ + false + true + true + false + ] + ''' + def explode(self) -> Series: + """ + Explode a list Series. 
+ + This means that every item is expanded to a new row. + + Returns + ------- + Series + Series with the data type of the list elements. + + See Also + -------- + Series.list.explode : Explode a list column. + Series.str.explode : Explode a string column. + """ + def equals(self, other: Series) -> bool: + ''' + Check whether the Series is equal to another Series. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don\'t allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + + See Also + -------- + assert_series_equal + + Examples + -------- + >>> s1 = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4, 5, 6]) + >>> s1.equals(s1) + True + >>> s1.equals(s2) + False + ''' + def cast(self, dtype: PolarsDataType | type[int] | type[float] | type[str] | type[bool]) -> Self: + ''' + Cast between data types. + + Parameters + ---------- + dtype + DataType to cast to. + strict + Throw an error if a cast could not be done (for instance, due to an + overflow). + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s + shape: (3,) + Series: \'a\' [bool] + [ + true + false + true + ] + + >>> s.cast(pl.UInt32) + shape: (3,) + Series: \'a\' [u32] + [ + 1 + 0 + 1 + ] + ''' + def to_physical(self) -> Series: + ''' + Cast to physical representation of the logical dtype. + + - :func:`polars.datatypes.Date` -> :func:`polars.datatypes.Int32` + - :func:`polars.datatypes.Datetime` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Time` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Duration` -> :func:`polars.datatypes.Int64` + - :func:`polars.datatypes.Categorical` -> :func:`polars.datatypes.UInt32` + - `List(inner)` -> `List(physical of inner)` + - Other data types will be left unchanged. + + Examples + -------- + Replicating the pandas + `pd.Series.factorize + `_ + method. + + >>> s = pl.Series("values", ["a", None, "x", "a"]) + >>> s.cast(pl.Categorical).to_physical() + shape: (4,) + Series: \'values\' [u32] + [ + 0 + null + 1 + 0 + ] + ''' + def to_list(self) -> list[Any]: + ''' + Convert this Series to a Python List. This operation clones data. + + Parameters + ---------- + use_pyarrow + Use pyarrow for the conversion. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_list() + [1, 2, 3] + >>> type(s.to_list()) + + ''' + def rechunk(self) -> Self: + """ + Create a single chunk of memory for this Series. + + Parameters + ---------- + in_place + In place or not. + """ + def reverse(self) -> Series: + ''' + Return Series in reverse order. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8) + >>> s.reverse() + shape: (3,) + Series: \'a\' [i8] + [ + 3 + 2 + 1 + ] + ''' + def is_between(self, lower_bound: IntoExpr, upper_bound: IntoExpr, closed: ClosedInterval = ...) -> Series: + ''' + Get a boolean mask of the values that are between the given lower/upper bounds. + + Parameters + ---------- + lower_bound + Lower bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + upper_bound + Upper bound value. Accepts expression input. Non-expression inputs + (including strings) are parsed as literals. + closed : {\'both\', \'left\', \'right\', \'none\'} + Define which sides of the interval are closed (inclusive). 
+ + Examples + -------- + >>> s = pl.Series("num", [1, 2, 3, 4, 5]) + >>> s.is_between(2, 4) + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + true + false + ] + + Use the `closed` argument to include or exclude the values at the bounds: + + >>> s.is_between(2, 4, closed="left") + shape: (5,) + Series: \'num\' [bool] + [ + false + true + true + false + false + ] + + You can also use strings as well as numeric/temporal values: + + >>> s = pl.Series("s", ["a", "b", "c", "d", "e"]) + >>> s.is_between("b", "d", closed="both") + shape: (5,) + Series: \'s\' [bool] + [ + false + true + true + true + false + ] + ''' + def to_numpy(self, *args: Any) -> np.ndarray[Any, Any]: + ''' + Convert this Series to numpy. + + This operation may clone data but is completely safe. Note that: + + - data which is purely numeric AND without null values is not cloned; + - floating point `nan` values can be zero-copied; + - booleans can\'t be zero-copied. + + To ensure that no data is cloned, set `zero_copy_only=True`. + + Parameters + ---------- + *args + args will be sent to pyarrow.Array.to_numpy. + zero_copy_only + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + use_pyarrow + Use `pyarrow.Array.to_numpy + `_ + + for the conversion to numpy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> arr = s.to_numpy() + >>> arr # doctest: +IGNORE_RESULT + array([1, 2, 3], dtype=int64) + >>> type(arr) + + ''' + def _view(self) -> SeriesView: + ''' + Get a view into this Series data with a numpy array. + + This operation doesn\'t clone data, but does not include missing values. + + Returns + ------- + SeriesView + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + + Examples + -------- + >>> s = pl.Series("a", [1, None]) + >>> s._view(ignore_nulls=True) + SeriesView([1, 0]) + ''' + def to_arrow(self) -> pa.Array: + ''' + Get the underlying Arrow Array. + + If the Series contains only a single chunk this operation is zero copy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s = s.to_arrow() + >>> s # doctest: +ELLIPSIS + + [ + 1, + 2, + 3 + ] + ''' + def to_pandas(self, **kwargs: Any) -> pd.Series[Any]: + ''' + Convert this Series to a pandas Series. + + This operation copies data if `use_pyarrow_extension_array` is not enabled. + + Parameters + ---------- + use_pyarrow_extension_array + Use a PyArrow-backed extension array instead of a NumPy array for the pandas + Series. This allows zero copy operations and preservation of null values. + Subsequent operations on the resulting pandas Series may trigger conversion + to NumPy if those operations are not supported by PyArrow compute functions. + **kwargs + Additional keyword arguments to be passed to + :meth:`pyarrow.Array.to_pandas`. + + Returns + ------- + :class:`pandas.Series` + + Notes + ----- + This operation requires that both :mod:`pandas` and :mod:`pyarrow` are + installed. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.to_pandas() + 0 1 + 1 2 + 2 3 + Name: a, dtype: int64 + + Null values are converted to `NaN`. 
+ + >>> s = pl.Series("b", [1, 2, None]) + >>> s.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + Name: b, dtype: float64 + + Pass `use_pyarrow_extension_array=True` to get a pandas Series backed by a + PyArrow extension array. This will preserve null values. + + >>> s.to_pandas(use_pyarrow_extension_array=True) + 0 1 + 1 2 + 2 + Name: b, dtype: int64[pyarrow] + ''' + def to_init_repr(self, n: int = ...) -> str: + ''' + Convert Series to instantiatable string representation. + + Parameters + ---------- + n + Only use first n elements. + + See Also + -------- + polars.Series.to_init_repr + polars.from_repr + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> print(s.to_init_repr()) + pl.Series("a", [1, 2, None, 4], dtype=pl.Int16) + >>> s_from_str_repr = eval(s.to_init_repr()) + >>> s_from_str_repr + shape: (4,) + Series: \'a\' [i16] + [ + 1 + 2 + null + 4 + ] + ''' + def count(self) -> int: + ''' + Return the number of non-null elements in the column. + + See Also + -------- + len + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.count() + 2 + ''' + def len(self) -> int: + ''' + Return the number of elements in the Series. + + Null values count towards the total. + + See Also + -------- + count + + Examples + -------- + >>> s = pl.Series("a", [1, 2, None]) + >>> s.len() + 3 + ''' + def set(self, filter: Series, value: int | float | str | bool | None) -> Series: + ''' + Set masked values. + + Parameters + ---------- + filter + Boolean mask. + value + Value with which to replace the masked values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimisation (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.set(s == 2, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().select( + ... pl.when(pl.col("a") == 2).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + ''' + def scatter(self, indices: Series | Iterable[int] | int | np.ndarray[Any, Any], values: Series | Iterable[PythonLiteral] | PythonLiteral | None) -> Series: + ''' + Set values at the index locations. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + + Notes + ----- + Use of this function is frequently an anti-pattern, as it can + block optimization (predicate pushdown, etc). Consider using + `pl.when(predicate).then(value).otherwise(self)` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.scatter(1, 10) + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 10 + 3 + ] + + It is better to implement this as follows: + + >>> s.to_frame().with_row_index().select( + ... pl.when(pl.col("index") == 1).then(10).otherwise(pl.col("a")) + ... ) + shape: (3, 1) + ┌─────────┐ + │ literal │ + │ --- │ + │ i64 │ + ╞═════════╡ + │ 1 │ + │ 10 │ + │ 3 │ + └─────────┘ + ''' + def clear(self, n: int = ...) -> Series: + ''' + Create an empty copy of the current Series, with zero to \'n\' elements. + + The copy has an identical name/dtype, but no data. + + Parameters + ---------- + n + Number of (empty) elements to return in the cleared frame. + + See Also + -------- + clone : Cheap deepcopy/clone. 
+ + Examples + -------- + >>> s = pl.Series("a", [None, True, False]) + >>> s.clear() + shape: (0,) + Series: \'a\' [bool] + [ + ] + + >>> s.clear(n=2) + shape: (2,) + Series: \'a\' [bool] + [ + null + null + ] + ''' + def clone(self) -> Self: + ''' + Create a copy of this Series. + + This is a cheap operation that does not copy data. + + See Also + -------- + clear : Create an empty copy of the current Series, with identical + schema but no data. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.clone() + shape: (3,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + ] + ''' + def fill_nan(self, value: int | float | Expr | None) -> Series: + ''' + Fill floating point NaN value with a fill value. + + Parameters + ---------- + value + Value used to fill NaN values. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, float("nan")]) + >>> s.fill_nan(0) + shape: (4,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 0.0 + ] + ''' + def fill_null(self, value: Any | None = ..., strategy: FillNullStrategy | None = ..., limit: int | None = ...) -> Series: + ''' + Fill null values using the specified value or strategy. + + Parameters + ---------- + value + Value used to fill null values. + strategy : {None, \'forward\', \'backward\', \'min\', \'max\', \'mean\', \'zero\', \'one\'} + Strategy used to fill null values. + limit + Number of consecutive null values to fill when using the \'forward\' or + \'backward\' strategy. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, None]) + >>> s.fill_null(strategy="forward") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 3 + ] + >>> s.fill_null(strategy="min") + shape: (4,) + Series: \'a\' [i64] + [ + 1 + 2 + 3 + 1 + ] + >>> s = pl.Series("b", ["x", None, "z"]) + >>> s.fill_null(pl.lit("")) + shape: (3,) + Series: \'b\' [str] + [ + "x" + "" + "z" + ] + ''' + def floor(self) -> Series: + ''' + Rounds down to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.floor() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + ] + ''' + def ceil(self) -> Series: + ''' + Rounds up to the nearest integer value. + + Only works on floating point Series. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.ceil() + shape: (3,) + Series: \'a\' [f64] + [ + 2.0 + 3.0 + 4.0 + ] + ''' + def round(self, decimals: int = ...) -> Series: + ''' + Round underlying floating point data by `decimals` digits. + + Examples + -------- + >>> s = pl.Series("a", [1.12345, 2.56789, 3.901234]) + >>> s.round(2) + shape: (3,) + Series: \'a\' [f64] + [ + 1.12 + 2.57 + 3.9 + ] + + Parameters + ---------- + decimals + number of decimals to round by. + ''' + def round_sig_figs(self, digits: int) -> Series: + """ + Round to a number of significant figures. + + Parameters + ---------- + digits + Number of significant figures to round to. + + Examples + -------- + >>> s = pl.Series([0.01234, 3.333, 1234.0]) + >>> s.round_sig_figs(2) + shape: (3,) + Series: '' [f64] + [ + 0.012 + 3.3 + 1200.0 + ] + """ + def dot(self, other: Series | ArrayLike) -> float | None: + ''' + Compute the dot/inner product between two Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s2 = pl.Series("b", [4.0, 5.0, 6.0]) + >>> s.dot(s2) + 32.0 + + Parameters + ---------- + other + Series (or array) to compute dot product with. + ''' + def mode(self) -> Series: + ''' + Compute the most occurring value(s). + + Can return multiple Values. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.mode() + shape: (1,) + Series: \'a\' [i64] + [ + 2 + ] + ''' + def sign(self) -> Series: + ''' + Compute the element-wise indication of the sign. + + The returned values can be -1, 0, or 1: + + * -1 if x < 0. + * 0 if x == 0. + * 1 if x > 0. + + (null values are preserved as-is). + + Examples + -------- + >>> s = pl.Series("a", [-9.0, -0.0, 0.0, 4.0, None]) + >>> s.sign() + shape: (5,) + Series: \'a\' [i64] + [ + -1 + 0 + 0 + 1 + null + ] + ''' + def sin(self) -> Series: + ''' + Compute the element-wise value for the sine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.sin() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.0 + 1.2246e-16 + ] + ''' + def cos(self) -> Series: + ''' + Compute the element-wise value for the cosine. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cos() + shape: (3,) + Series: \'a\' [f64] + [ + 1.0 + 6.1232e-17 + -1.0 + ] + ''' + def tan(self) -> Series: + ''' + Compute the element-wise value for the tangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.tan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.6331e16 + -1.2246e-16 + ] + ''' + def cot(self) -> Series: + ''' + Compute the element-wise value for the cotangent. + + Examples + -------- + >>> import math + >>> s = pl.Series("a", [0.0, math.pi / 2.0, math.pi]) + >>> s.cot() + shape: (3,) + Series: \'a\' [f64] + [ + inf + 6.1232e-17 + -8.1656e15 + ] + ''' + def arcsin(self) -> Series: + ''' + Compute the element-wise value for the inverse sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsin() + shape: (3,) + Series: \'a\' [f64] + [ + 1.570796 + 0.0 + -1.570796 + ] + ''' + def arccos(self) -> Series: + ''' + Compute the element-wise value for the inverse cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arccos() + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 1.570796 + 3.141593 + ] + ''' + def arctan(self) -> Series: + ''' + Compute the element-wise value for the inverse tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arctan() + shape: (3,) + Series: \'a\' [f64] + [ + 0.785398 + 0.0 + -0.785398 + ] + ''' + def arcsinh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic sine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.arcsinh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.881374 + 0.0 + -0.881374 + ] + ''' + def arccosh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [5.0, 1.0, 0.0, -1.0]) + >>> s.arccosh() + shape: (4,) + Series: \'a\' [f64] + [ + 2.292432 + 0.0 + NaN + NaN + ] + ''' + def arctanh(self) -> Series: + ''' + Compute the element-wise value for the inverse hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [2.0, 1.0, 0.5, 0.0, -0.5, -1.0, -1.1]) + >>> s.arctanh() + shape: (7,) + Series: \'a\' [f64] + [ + NaN + inf + 0.549306 + 0.0 + -0.549306 + -inf + NaN + ] + ''' + def sinh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic sine. 
+ + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.sinh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.175201 + 0.0 + -1.175201 + ] + ''' + def cosh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic cosine. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.cosh() + shape: (3,) + Series: \'a\' [f64] + [ + 1.543081 + 1.0 + 1.543081 + ] + ''' + def tanh(self) -> Series: + ''' + Compute the element-wise value for the hyperbolic tangent. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 0.0, -1.0]) + >>> s.tanh() + shape: (3,) + Series: \'a\' [f64] + [ + 0.761594 + 0.0 + -0.761594 + ] + ''' + def map_elements(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self: + ''' + Map a custom/user-defined function (UDF) over elements in this Series. + + .. warning:: + This method is much slower than the native expressions API. + Only use it if you cannot implement your logic otherwise. + + If the function returns a different datatype, the return_dtype arg should + be set, otherwise the method will fail. + + Implementing logic using a Python function is almost always *significantly* + slower and more memory intensive than implementing the same logic using + the native expression API because: + + - The native expression engine runs in Rust; UDFs run in Python. + - Use of Python UDFs forces the DataFrame to be materialized in memory. + - Polars-native expressions can be parallelised (UDFs typically cannot). + - Polars-native expressions can be logically optimised (UDFs cannot). + + Wherever possible you should strongly prefer the native expression API + to achieve the best performance. + + Parameters + ---------- + function + Custom function or lambda. + return_dtype + Output datatype. If none is given, the same datatype as this Series will be + used. + skip_nulls + Nulls will be skipped and not passed to the python function. + This is faster because python can be skipped and because we call + more specialized functions. + + Warnings + -------- + If `return_dtype` is not provided, this may lead to unexpected results. + We allow this, but it is considered a bug in the user\'s query. + + Notes + ----- + If your function is expensive and you don\'t want it to be called more than + once for a given input, consider applying an `@lru_cache` decorator to it. + If your data is suitable you may achieve *significant* speedups. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.map_elements(lambda x: x + 10) # doctest: +SKIP + shape: (3,) + Series: \'a\' [i64] + [ + 11 + 12 + 13 + ] + + Returns + ------- + Series + ''' + def shift(self, n: int = ...) -> Series: + """ + Shift values by the given number of indices. + + Parameters + ---------- + n + Number of indices to shift forward. If a negative value is passed, values + are shifted in the opposite direction instead. + fill_value + Fill the resulting null values with this value. Accepts expression input. + Non-expression inputs are parsed as literals. + + Notes + ----- + This method is similar to the `LAG` operation in SQL when the value for `n` + is positive. With a negative value for `n`, it is similar to `LEAD`. + + Examples + -------- + By default, values are shifted forward by one index. + + >>> s = pl.Series([1, 2, 3, 4]) + >>> s.shift() + shape: (4,) + Series: '' [i64] + [ + null + 1 + 2 + 3 + ] + + Pass a negative value to shift in the opposite direction instead. 
+ + >>> s.shift(-2) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + null + null + ] + + Specify `fill_value` to fill the resulting null values. + + >>> s.shift(-2, fill_value=100) + shape: (4,) + Series: '' [i64] + [ + 3 + 4 + 100 + 100 + ] + """ + def zip_with(self, mask: Series, other: Series) -> Self: + """ + Take values from self or other based on the given mask. + + Where mask evaluates true, take values from self. Where mask evaluates false, + take values from other. + + Parameters + ---------- + mask + Boolean Series. + other + Series of same type. + + Returns + ------- + Series + + Examples + -------- + >>> s1 = pl.Series([1, 2, 3, 4, 5]) + >>> s2 = pl.Series([5, 4, 3, 2, 1]) + >>> s1.zip_with(s1 < s2, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 2 + 3 + 2 + 1 + ] + >>> mask = pl.Series([True, False, True, False, True]) + >>> s1.zip_with(mask, s2) + shape: (5,) + Series: '' [i64] + [ + 1 + 4 + 3 + 2 + 5 + ] + """ + def rolling_min(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling min (moving min) over the values in this array. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their min. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_min(window_size=3) + shape: (5,) + Series: \'a\' [i64] + [ + null + null + 100 + 200 + 300 + ] + ''' + def rolling_max(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling max (moving max) over the values in this array. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their max. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. 
If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_max(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 200 + 300 + 400 + 500 + ] + ''' + def rolling_mean(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling mean (moving mean) over the values in this array. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their mean. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [100, 200, 300, 400, 500]) + >>> s.rolling_mean(window_size=2) + shape: (5,) + Series: \'a\' [f64] + [ + null + 150.0 + 250.0 + 350.0 + 450.0 + ] + ''' + def rolling_sum(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Apply a rolling sum (moving sum) over the values in this array. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their sum. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length of the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.rolling_sum(window_size=2) + shape: (5,) + Series: \'a\' [i64] + [ + null + 3 + 5 + 7 + 9 + ] + ''' + def rolling_std(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling std dev. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. 
The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their std dev. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_std(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 1.527525 + 2.0 + ] + ''' + def rolling_var(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling variance. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + A window of length `window_size` will traverse the array. The values that fill + this window will (optionally) be multiplied with the weights given by the + `weight` vector. The resulting values will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_var(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 1.0 + 2.333333 + 4.0 + ] + ''' + def rolling_map(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a custom rolling window function. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + function + Custom aggregation function. + window_size + Size of the window. The window at a given row will include the row + itself and the `window_size - 1` elements before it. + weights + A list of weights with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window. 
+ + Warnings + -------- + Computing custom functions is extremely slow. Use specialized rolling + functions such as :func:`Series.rolling_sum` if at all possible. + + Examples + -------- + >>> from numpy import nansum + >>> s = pl.Series([11.0, 2.0, 9.0, float("nan"), 8.0]) + >>> s.rolling_map(nansum, window_size=3) + shape: (5,) + Series: \'\' [f64] + [ + null + null + 22.0 + 11.0 + 17.0 + ] + ''' + def rolling_median(self, window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling median. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Parameters + ---------- + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_median(window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 2.0 + 3.0 + 4.0 + 6.0 + ] + ''' + def rolling_quantile(self, quantile: float, interpolation: RollingInterpolationMethod = ..., window_size: int = ..., weights: list[float] | None = ..., min_periods: int | None = ...) -> Series: + ''' + Compute a rolling quantile. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Parameters + ---------- + quantile + Quantile between 0.0 and 1.0. + interpolation : {\'nearest\', \'higher\', \'lower\', \'midpoint\', \'linear\'} + Interpolation method. + window_size + The length of the window. + weights + An optional slice with the same length as the window that will be multiplied + elementwise with the values in the window. + min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0, 4.0, 6.0, 8.0]) + >>> s.rolling_quantile(quantile=0.33, window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.0 + 2.0 + 3.0 + 4.0 + ] + >>> s.rolling_quantile(quantile=0.33, interpolation="linear", window_size=3) + shape: (6,) + Series: \'a\' [f64] + [ + null + null + 1.66 + 2.66 + 3.66 + 5.32 + ] + ''' + def rolling_skew(self, window_size: int) -> Series: + """ + Compute a rolling skew. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + The window at a given row includes the row itself and the + `window_size - 1` elements before it. + + Parameters + ---------- + window_size + Integer size of the rolling window. + bias + If False, the calculations are corrected for statistical bias. 
+ + Examples + -------- + >>> pl.Series([1, 4, 2, 9]).rolling_skew(3) + shape: (4,) + Series: '' [f64] + [ + null + null + 0.381802 + 0.47033 + ] + + Note how the values match + + >>> pl.Series([1, 4, 2]).skew(), pl.Series([4, 2, 9]).skew() + (0.38180177416060584, 0.47033046033698594) + """ + def sample(self, n: int | None = ...) -> Series: + ''' + Sample from this Series. + + Parameters + ---------- + n + Number of items to return. Cannot be used with `fraction`. Defaults to 1 if + `fraction` is None. + fraction + Fraction of items to return. Cannot be used with `n`. + with_replacement + Allow values to be sampled more than once. + shuffle + Shuffle the order of sampled data points. + seed + Seed for the random number generator. If set to None (default), a + random seed is generated for each sample operation. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.sample(2, seed=0) # doctest: +IGNORE_RESULT + shape: (2,) + Series: \'a\' [i64] + [ + 1 + 5 + ] + ''' + def peak_max(self) -> Self: + ''' + Get a boolean mask of the local maximum peaks. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3, 4, 5]) + >>> s.peak_max() + shape: (5,) + Series: \'a\' [bool] + [ + false + false + false + false + true + ] + ''' + def peak_min(self) -> Self: + ''' + Get a boolean mask of the local minimum peaks. + + Examples + -------- + >>> s = pl.Series("a", [4, 1, 3, 2, 5]) + >>> s.peak_min() + shape: (5,) + Series: \'a\' [bool] + [ + false + true + false + true + false + ] + ''' + def n_unique(self) -> int: + ''' + Count the number of unique values in this Series. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 2, 3]) + >>> s.n_unique() + 3 + ''' + def shrink_to_fit(self) -> Series: + """ + Shrink Series memory usage. + + Shrinks the underlying array capacity to exactly fit the actual data. + (Note that this function does not change the Series data type). + """ + def hash(self, seed: int = ..., seed_1: int | None = ..., seed_2: int | None = ..., seed_3: int | None = ...) -> Series: + ''' + Hash the Series. + + The hash value is of type `UInt64`. + + Parameters + ---------- + seed + Random seed parameter. Defaults to 0. + seed_1 + Random seed parameter. Defaults to `seed` if not set. + seed_2 + Random seed parameter. Defaults to `seed` if not set. + seed_3 + Random seed parameter. Defaults to `seed` if not set. + + Notes + ----- + This implementation of `hash` does not guarantee stable results + across different Polars versions. Its stability is only guaranteed within a + single version. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.hash(seed=42) # doctest: +IGNORE_RESULT + shape: (3,) + Series: \'a\' [u64] + [ + 10734580197236529959 + 3022416320763508302 + 13756996518000038261 + ] + ''' + def reinterpret(self) -> Series: + """ + Reinterpret the underlying bits as a signed/unsigned integer. + + This operation is only allowed for 64bit integers. For lower bits integers, + you can safely use that cast operation. + + Parameters + ---------- + signed + If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`. + """ + def interpolate(self, method: InterpolationMethod = ...) -> Series: + ''' + Fill null values using interpolation. + + Parameters + ---------- + method : {\'linear\', \'nearest\'} + Interpolation method. 
+ + Examples + -------- + >>> s = pl.Series("a", [1, 2, None, None, 5]) + >>> s.interpolate() + shape: (5,) + Series: \'a\' [f64] + [ + 1.0 + 2.0 + 3.0 + 4.0 + 5.0 + ] + ''' + def abs(self) -> Series: + """ + Compute absolute values. + + Same as `abs(series)`. + + Examples + -------- + >>> s = pl.Series([1, -2, -3]) + >>> s.abs() + shape: (3,) + Series: '' [i64] + [ + 1 + 2 + 3 + ] + """ + def rank(self, method: RankMethod = ...) -> Series: + ''' + Assign ranks to data, dealing with ties appropriately. + + Parameters + ---------- + method : {\'average\', \'min\', \'max\', \'dense\', \'ordinal\', \'random\'} + The method used to assign ranks to tied elements. + The following methods are available (default is \'average\'): + + - \'average\' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - \'min\' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - \'max\' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - \'dense\' : Like \'min\', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - \'ordinal\' : All values are given a distinct rank, corresponding to + the order that the values occur in the Series. + - \'random\' : Like \'ordinal\', but the rank for ties is not dependent + on the order that the values occur in the Series. + descending + Rank in descending order. + seed + If `method="random"`, use this as seed. + + Examples + -------- + The \'average\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank() + shape: (5,) + Series: \'a\' [f64] + [ + 3.0 + 4.5 + 1.5 + 1.5 + 4.5 + ] + + The \'ordinal\' method: + + >>> s = pl.Series("a", [3, 6, 1, 1, 6]) + >>> s.rank("ordinal") + shape: (5,) + Series: \'a\' [u32] + [ + 3 + 4 + 1 + 2 + 5 + ] + ''' + def diff(self, n: int = ..., null_behavior: NullBehavior = ...) -> Series: + ''' + Calculate the first discrete difference between shifted items. + + Parameters + ---------- + n + Number of slots to shift. + null_behavior : {\'ignore\', \'drop\'} + How to handle null values. + + Examples + -------- + >>> s = pl.Series("s", values=[20, 10, 30, 25, 35], dtype=pl.Int8) + >>> s.diff() + shape: (5,) + Series: \'s\' [i8] + [ + null + -10 + 20 + -5 + 10 + ] + + >>> s.diff(n=2) + shape: (5,) + Series: \'s\' [i8] + [ + null + null + 10 + 15 + 5 + ] + + >>> s.diff(n=2, null_behavior="drop") + shape: (3,) + Series: \'s\' [i8] + [ + 10 + 15 + 5 + ] + ''' + def pct_change(self, n: int | IntoExprColumn = ...) -> Series: + """ + Computes percentage change between values. + + Percentage change (as fraction) between current element and most-recent + non-null element at least `n` period(s) before the current element. + + Computes the change from the previous row by default. + + Parameters + ---------- + n + periods to shift for forming percent change. + + Examples + -------- + >>> pl.Series(range(10)).pct_change() + shape: (10,) + Series: '' [f64] + [ + null + inf + 1.0 + 0.5 + 0.333333 + 0.25 + 0.2 + 0.166667 + 0.142857 + 0.125 + ] + + >>> pl.Series([1, 2, 4, 8, 16, 32, 64, 128, 256, 512]).pct_change(2) + shape: (10,) + Series: '' [f64] + [ + null + null + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + 3.0 + ] + """ + def skew(self) -> float | None: + """ + Compute the sample skewness of a data set. 
+ + For normally distributed data, the skewness should be about zero. For + unimodal continuous distributions, a skewness value greater than zero means + that there is more weight in the right tail of the distribution. The + function `skewtest` can be used to determine if the skewness value + is close enough to zero, statistically speaking. + + + See scipy.stats for more information. + + Parameters + ---------- + bias : bool, optional + If False, the calculations are corrected for statistical bias. + + Notes + ----- + The sample skewness is computed as the Fisher-Pearson coefficient + of skewness, i.e. + + .. math:: g_1=\\frac{m_3}{m_2^{3/2}} + + where + + .. math:: m_i=\\frac{1}{N}\\sum_{n=1}^N(x[n]-\\bar{x})^i + + is the biased sample :math:`i\\texttt{th}` central moment, and + :math:`\\bar{x}` is + the sample mean. If `bias` is False, the calculations are + corrected for bias and the value computed is the adjusted + Fisher-Pearson standardized moment coefficient, i.e. + + .. math:: + G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2}\\frac{m_3}{m_2^{3/2}} + + Examples + -------- + >>> s = pl.Series([1, 2, 2, 4, 5]) + >>> s.skew() + 0.34776706224699483 + """ + def kurtosis(self) -> float | None: + """ + Compute the kurtosis (Fisher or Pearson) of a dataset. + + Kurtosis is the fourth central moment divided by the square of the + variance. If Fisher's definition is used, then 3.0 is subtracted from + the result to give 0.0 for a normal distribution. + If bias is False then the kurtosis is calculated using k statistics to + eliminate bias coming from biased moment estimators + + See scipy.stats for more information + + Parameters + ---------- + fisher : bool, optional + If True, Fisher's definition is used (normal ==> 0.0). If False, + Pearson's definition is used (normal ==> 3.0). + bias : bool, optional + If False, the calculations are corrected for statistical bias. + """ + def clip(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ..., upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn | None = ...) -> Series: + """ + Set values outside the given boundaries to the boundary value. + + Parameters + ---------- + lower_bound + Lower bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no lower bound is applied. + upper_bound + Upper bound. Accepts expression input. + Non-expression inputs are parsed as literals. + If set to `None` (default), no upper bound is applied. + + See Also + -------- + when + + Notes + ----- + This method only works for numeric and temporal columns. To clip other data + types, consider writing a `when-then-otherwise` expression. See :func:`when`. + + Examples + -------- + Specifying both a lower and upper bound: + + >>> s = pl.Series([-50, 5, 50, None]) + >>> s.clip(1, 10) + shape: (4,) + Series: '' [i64] + [ + 1 + 5 + 10 + null + ] + + Specifying only a single bound: + + >>> s.clip(upper_bound=10) + shape: (4,) + Series: '' [i64] + [ + -50 + 5 + 10 + null + ] + """ + def lower_bound(self) -> Self: + ''' + Return the lower bound of this Series\' dtype as a unit Series. + + See Also + -------- + upper_bound : return the upper bound of the given Series\' dtype. 
+
+        Examples
+        --------
+        >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int32)
+        >>> s.lower_bound()
+        shape: (1,)
+        Series: \'s\' [i32]
+        [
+            -2147483648
+        ]
+
+        >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float32)
+        >>> s.lower_bound()
+        shape: (1,)
+        Series: \'s\' [f32]
+        [
+            -inf
+        ]
+        \'\'\'
+    def upper_bound(self) -> Self:
+        \'\'\'
+        Return the upper bound of this Series\' dtype as a unit Series.
+
+        See Also
+        --------
+        lower_bound : return the lower bound of the given Series\' dtype.
+
+        Examples
+        --------
+        >>> s = pl.Series("s", [-1, 0, 1], dtype=pl.Int8)
+        >>> s.upper_bound()
+        shape: (1,)
+        Series: \'s\' [i8]
+        [
+            127
+        ]
+
+        >>> s = pl.Series("s", [1.0, 2.5, 3.0], dtype=pl.Float64)
+        >>> s.upper_bound()
+        shape: (1,)
+        Series: \'s\' [f64]
+        [
+            inf
+        ]
+        \'\'\'
+    def replace(self, old: IntoExpr | Sequence[Any] | Mapping[Any, Any], new: IntoExpr | Sequence[Any] | NoDefault = ...) -> Self:
+        \'\'\'
+        Replace values by different values.
+
+        Parameters
+        ----------
+        old
+            Value or sequence of values to replace.
+            Also accepts a mapping of values to their replacement as syntactic sugar for
+            `replace(old=Series(mapping.keys()), new=Series(mapping.values()))`.
+        new
+            Value or sequence of values to replace by.
+            Length must match the length of `old` or have length 1.
+        default
+            Set values that were not replaced to this value.
+            Defaults to keeping the original value.
+            Accepts expression input. Non-expression inputs are parsed as literals.
+        return_dtype
+            The data type of the resulting Series. If set to `None` (default),
+            the data type is determined automatically based on the other inputs.
+
+        See Also
+        --------
+        str.replace
+
+        Notes
+        -----
+        The global string cache must be enabled when replacing categorical values.
+
+        Examples
+        --------
+        Replace a single value by another value. Values that were not replaced remain
+        unchanged.
+
+        >>> s = pl.Series([1, 2, 2, 3])
+        >>> s.replace(2, 100)
+        shape: (4,)
+        Series: \'\' [i64]
+        [
+            1
+            100
+            100
+            3
+        ]
+
+        Replace multiple values by passing sequences to the `old` and `new` parameters.
+
+        >>> s.replace([2, 3], [100, 200])
+        shape: (4,)
+        Series: \'\' [i64]
+        [
+            1
+            100
+            100
+            200
+        ]
+
+        Passing a mapping with replacements is also supported as syntactic sugar.
+        Specify a default to set all values that were not matched.
+
+        >>> mapping = {2: 100, 3: 200}
+        >>> s.replace(mapping, default=-1)
+        shape: (4,)
+        Series: \'\' [i64]
+        [
+            -1
+            100
+            100
+            200
+        ]
+
+        The default can be another Series.
+
+        >>> default = pl.Series([2.5, 5.0, 7.5, 10.0])
+        >>> s.replace(2, 100, default=default)
+        shape: (4,)
+        Series: \'\' [f64]
+        [
+            2.5
+            100.0
+            100.0
+            10.0
+        ]
+
+        Replacing by values of a different data type sets the return type based on
+        a combination of the `new` data type and either the original data type or the
+        default data type if it was set.
+
+        >>> s = pl.Series(["x", "y", "z"])
+        >>> mapping = {"x": 1, "y": 2, "z": 3}
+        >>> s.replace(mapping)
+        shape: (3,)
+        Series: \'\' [str]
+        [
+            "1"
+            "2"
+            "3"
+        ]
+        >>> s.replace(mapping, default=None)
+        shape: (3,)
+        Series: \'\' [i64]
+        [
+            1
+            2
+            3
+        ]
+
+        Set the `return_dtype` parameter to control the resulting data type directly.
+
+        >>> s.replace(mapping, return_dtype=pl.UInt8)
+        shape: (3,)
+        Series: \'\' [u8]
+        [
+            1
+            2
+            3
+        ]
+        \'\'\'
+    def reshape(self, dimensions: tuple[int, ...]) -> Series:
+        \'\'\'
+        Reshape this Series to a flat Series or a Series of Lists.
+
+        Parameters
+        ----------
+        dimensions
+            Tuple of the dimension sizes. If a -1 is used in any of the dimensions, that
+            dimension is inferred.
+
+        Returns
+        -------
+        Series
+            If a single dimension is given, results in a Series of the original
+            data type.
+            If multiple dimensions are given, results in a Series of data type
+            :class:`List` with shape (rows, cols).
+
+        See Also
+        --------
+        Series.list.explode : Explode a list column.
+
+        Examples
+        --------
+        >>> s = pl.Series("foo", [1, 2, 3, 4, 5, 6, 7, 8, 9])
+        >>> s.reshape((3, 3))
+        shape: (3,)
+        Series: \'foo\' [list[i64]]
+        [
+            [1, 2, 3]
+            [4, 5, 6]
+            [7, 8, 9]
+        ]
+        \'\'\'
+    def shuffle(self, seed: int | None = ...) -> Series:
+        \'\'\'
+        Shuffle the contents of this Series.
+
+        Parameters
+        ----------
+        seed
+            Seed for the random number generator. If set to None (default), a
+            random seed is generated each time the shuffle is called.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1, 2, 3])
+        >>> s.shuffle(seed=1)
+        shape: (3,)
+        Series: \'a\' [i64]
+        [
+            2
+            1
+            3
+        ]
+        \'\'\'
+    def ewm_mean(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series:
+        """
+        Exponentially-weighted moving average.
+
+        Parameters
+        ----------
+        com
+            Specify decay in terms of center of mass, :math:`\\gamma`, with
+
+            .. math::
+                \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0
+        span
+            Specify decay in terms of span, :math:`\\theta`, with
+
+            .. math::
+                \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1
+        half_life
+            Specify decay in terms of half-life, :math:`\\lambda`, with
+
+            .. math::
+                \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\;
+                \\forall \\; \\lambda > 0
+        alpha
+            Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`.
+        adjust
+            Divide by decaying adjustment factor in beginning periods to account for
+            imbalance in relative weightings
+
+            - When `adjust=True` the EW function is calculated
+              using weights :math:`w_i = (1 - \\alpha)^i`
+            - When `adjust=False` the EW function is calculated
+              recursively by
+
+            .. math::
+                y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t
+        min_periods
+            Minimum number of observations in window required to have a value
+            (otherwise result is null).
+        ignore_nulls
+            Ignore missing values when calculating weights.
+
+            - When `ignore_nulls=False` (default), weights are based on absolute
+              positions.
+              For example, the weights of :math:`x_0` and :math:`x_2` used in
+              calculating the final weighted average of
+              [:math:`x_0`, None, :math:`x_2`] are
+              :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and
+              :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`.
+
+            - When `ignore_nulls=True`, weights are based
+              on relative positions. For example, the weights of
+              :math:`x_0` and :math:`x_2` used in calculating the final weighted
+              average of [:math:`x_0`, None, :math:`x_2`] are
+              :math:`1-\\alpha` and :math:`1` if `adjust=True`,
+              and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`.
+
+        Examples
+        --------
+        >>> s = pl.Series([1, 2, 3])
+        >>> s.ewm_mean(com=1)
+        shape: (3,)
+        Series: '' [f64]
+        [
+            1.0
+            1.666667
+            2.428571
+        ]
+        """
+    def ewm_std(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series:
+        \'\'\'
+        Exponentially-weighted moving standard deviation.
+
+        Parameters
+        ----------
+        com
+            Specify decay in terms of center of mass, :math:`\\gamma`, with
+
+            .. 
math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in + calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and + :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`. + + - When `ignore_nulls=True`, weights are based + on relative positions. For example, the weights of + :math:`x_0` and :math:`x_2` used in calculating the final weighted + average of [:math:`x_0`, None, :math:`x_2`] are + :math:`1-\\alpha` and :math:`1` if `adjust=True`, + and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.ewm_std(com=1) + shape: (3,) + Series: \'a\' [f64] + [ + 0.0 + 0.707107 + 0.963624 + ] + ''' + def ewm_var(self, com: float | None = ..., span: float | None = ..., half_life: float | None = ..., alpha: float | None = ...) -> Series: + ''' + Exponentially-weighted moving variance. + + Parameters + ---------- + com + Specify decay in terms of center of mass, :math:`\\gamma`, with + + .. math:: + \\alpha = \\frac{1}{1 + \\gamma} \\; \\forall \\; \\gamma \\geq 0 + span + Specify decay in terms of span, :math:`\\theta`, with + + .. math:: + \\alpha = \\frac{2}{\\theta + 1} \\; \\forall \\; \\theta \\geq 1 + half_life + Specify decay in terms of half-life, :math:`\\lambda`, with + + .. math:: + \\alpha = 1 - \\exp \\left\\{ \\frac{ -\\ln(2) }{ \\lambda } \\right\\} \\; + \\forall \\; \\lambda > 0 + alpha + Specify smoothing factor alpha directly, :math:`0 < \\alpha \\leq 1`. + adjust + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings + + - When `adjust=True` the EW function is calculated + using weights :math:`w_i = (1 - \\alpha)^i` + - When `adjust=False` the EW function is calculated + recursively by + + .. math:: + y_0 &= x_0 \\\\\n y_t &= (1 - \\alpha)y_{t - 1} + \\alpha x_t + bias + When `bias=False`, apply a correction to make the estimate statistically + unbiased. + min_periods + Minimum number of observations in window required to have a value + (otherwise result is null). + ignore_nulls + Ignore missing values when calculating weights. + + - When `ignore_nulls=False` (default), weights are based on absolute + positions. 
+
+              For example, the weights of :math:`x_0` and :math:`x_2` used in
+              calculating the final weighted average of
+              [:math:`x_0`, None, :math:`x_2`] are
+              :math:`(1-\\alpha)^2` and :math:`1` if `adjust=True`, and
+              :math:`(1-\\alpha)^2` and :math:`\\alpha` if `adjust=False`.
+
+            - When `ignore_nulls=True`, weights are based
+              on relative positions. For example, the weights of
+              :math:`x_0` and :math:`x_2` used in calculating the final weighted
+              average of [:math:`x_0`, None, :math:`x_2`] are
+              :math:`1-\\alpha` and :math:`1` if `adjust=True`,
+              and :math:`1-\\alpha` and :math:`\\alpha` if `adjust=False`.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1, 2, 3])
+        >>> s.ewm_var(com=1)
+        shape: (3,)
+        Series: \'a\' [f64]
+        [
+            0.0
+            0.5
+            0.928571
+        ]
+        \'\'\'
+    def extend_constant(self, value: PythonLiteral | None, n: int) -> Series:
+        """
+        Extremely fast method for extending the Series with 'n' copies of a value.
+
+        Parameters
+        ----------
+        value
+            A constant literal value (not an expression) with which to extend
+            the Series; can pass None to extend with nulls.
+        n
+            The number of additional values that will be added.
+
+        Examples
+        --------
+        >>> s = pl.Series([1, 2, 3])
+        >>> s.extend_constant(99, n=2)
+        shape: (5,)
+        Series: '' [i64]
+        [
+            1
+            2
+            3
+            99
+            99
+        ]
+        """
+    def set_sorted(self) -> Self:
+        \'\'\'
+        Flags the Series as \'sorted\'.
+
+        Enables downstream code to use fast paths for sorted arrays.
+
+        Parameters
+        ----------
+        descending
+            If the `Series` order is descending.
+
+        Warnings
+        --------
+        This can lead to incorrect results if this `Series` is not sorted!!
+        Use with care!
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1, 2, 3])
+        >>> s.set_sorted().max()
+        3
+        \'\'\'
+    def new_from_index(self, index: int, length: int) -> Self:
+        """Create a new Series filled with values from the given index."""
+    def shrink_dtype(self) -> Series:
+        """
+        Shrink numeric columns to the minimal required datatype.
+
+        Shrink to the dtype needed to fit the extrema of this [`Series`].
+        This can be used to reduce memory pressure.
+        """
+    def get_chunks(self) -> list[Series]:
+        """Get the chunks of this Series as a list of Series."""
+    def implode(self) -> Self:
+        \'\'\'
+        Aggregate values into a list.
+
+        Examples
+        --------
+        >>> s = pl.Series("a", [1, 2, 3])
+        >>> s.implode()
+        shape: (1,)
+        Series: \'a\' [list[i64]]
+        [
+            [1, 2, 3]
+        ]
+        \'\'\'
+    def apply(self, function: Callable[[Any], Any], return_dtype: PolarsDataType | None = ...) -> Self:
+        """
+        Apply a custom/user-defined function (UDF) over elements in this Series.
+
+        .. deprecated:: 0.19.0
+            This method has been renamed to :func:`Series.map_elements`.
+
+        Parameters
+        ----------
+        function
+            Custom function or lambda.
+        return_dtype
+            Output datatype. If none is given, the same datatype as this Series will be
+            used.
+        skip_nulls
+            Nulls will be skipped and not passed to the python function.
+            This is faster because python can be skipped and because we call
+            more specialized functions.
+        """
+    def rolling_apply(self, function: Callable[[Series], Any], window_size: int, weights: list[float] | None = ..., min_periods: int | None = ...) -> Series:
+        """
+        Apply a custom rolling window function.
+
+        .. deprecated:: 0.19.0
+            This method has been renamed to :func:`Series.rolling_map`.
+
+        Parameters
+        ----------
+        function
+            Aggregation function
+        window_size
+            The length of the window.
+        weights
+            An optional slice with the same length as the window that will be multiplied
+            elementwise with the values in the window.
+ min_periods + The number of values in the window that should be non-null before computing + a result. If None, it will be set equal to: + + - the window size, if `window_size` is a fixed integer + - 1, if `window_size` is a dynamic temporal size + center + Set the labels at the center of the window + """ + def is_first(self) -> Series: + """ + Return a boolean mask indicating the first occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_first_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + """ + def is_last(self) -> Series: + """ + Return a boolean mask indicating the last occurrence of each distinct value. + + .. deprecated:: 0.19.3 + This method has been renamed to :func:`Series.is_last_distinct`. + + Returns + ------- + Series + Series of data type :class:`Boolean`. + """ + def clip_min(self, lower_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `min` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + lower_bound + Lower bound. + """ + def clip_max(self, upper_bound: NumericLiteral | TemporalLiteral | IntoExprColumn) -> Series: + """ + Clip (limit) the values in an array to a `max` boundary. + + .. deprecated:: 0.19.12 + Use :func:`clip` instead. + + Parameters + ---------- + upper_bound + Upper bound. + """ + def shift_and_fill(self, fill_value: int | Expr) -> Series: + """ + Shift values by the given number of places and fill the resulting null values. + + .. deprecated:: 0.19.12 + Use :func:`shift` instead. + + Parameters + ---------- + fill_value + Fill None values with the result of this expression. + n + Number of places to shift (may be negative). + """ + def is_float(self) -> bool: + ''' + Check if this Series has floating point numbers. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_float()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1.0, 2.0, 3.0]) + >>> s.is_float() # doctest: +SKIP + True + ''' + def is_integer(self, signed: bool | None = ...) -> bool: + ''' + Check if this Series datatype is an integer (signed or unsigned). + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_integer()` instead. + For signed/unsigned variants, use `Series.dtype.is_signed_integer()` + or `Series.dtype.is_unsigned_integer()`. + + Parameters + ---------- + signed + * if `None`, both signed and unsigned integer dtypes will match. + * if `True`, only signed integer dtypes will be considered a match. + * if `False`, only unsigned integer dtypes will be considered a match. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3], dtype=pl.UInt32) + >>> s.is_integer() # doctest: +SKIP + True + >>> s.is_integer(signed=False) # doctest: +SKIP + True + >>> s.is_integer(signed=True) # doctest: +SKIP + False + ''' + def is_numeric(self) -> bool: + ''' + Check if this Series datatype is numeric. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_numeric()` instead. + + Examples + -------- + >>> s = pl.Series("a", [1, 2, 3]) + >>> s.is_numeric() # doctest: +SKIP + True + ''' + def is_temporal(self, excluding: OneOrMoreDataTypes | None = ...) -> bool: + """ + Check if this Series datatype is temporal. + + .. deprecated:: 0.19.13 + Use `Series.dtype.is_temporal()` instead. + + Parameters + ---------- + excluding + Optionally exclude one or more temporal dtypes from matching. 
+ + Examples + -------- + >>> from datetime import date + >>> s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)]) + >>> s.is_temporal() # doctest: +SKIP + True + >>> s.is_temporal(excluding=[pl.Date]) # doctest: +SKIP + False + """ + def is_boolean(self) -> bool: + ''' + Check if this Series is a Boolean. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.Boolean` instead. + + Examples + -------- + >>> s = pl.Series("a", [True, False, True]) + >>> s.is_boolean() # doctest: +SKIP + True + ''' + def is_utf8(self) -> bool: + ''' + Check if this Series datatype is a String. + + .. deprecated:: 0.19.14 + Use `Series.dtype == pl.String` instead. + + Examples + -------- + >>> s = pl.Series("x", ["a", "b", "c"]) + >>> s.is_utf8() # doctest: +SKIP + True + ''' + def take_every(self, n: int, offset: int = ...) -> Series: + """ + Take every nth value in the Series and return as new Series. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather_every`. + + Parameters + ---------- + n + Gather every *n*-th row. + offset + Starting index. + """ + def take(self, indices: int | list[int] | Expr | Series | np.ndarray[Any, Any]) -> Series: + """ + Take values by index. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`gather`. + + Parameters + ---------- + indices + Index location used for selection. + """ + def set_at_idx(self, indices: Series | np.ndarray[Any, Any] | Sequence[int] | int, values: int | float | str | bool | date | datetime | Sequence[int] | Sequence[float] | Sequence[bool] | Sequence[str] | Sequence[date] | Sequence[datetime] | Series | None) -> Series: + """ + Set values at the index locations. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`scatter`. + + Parameters + ---------- + indices + Integers representing the index locations. + values + Replacement values. + """ + def cumsum(self) -> Series: + """ + Get an array with the cumulative sum computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_sum`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummax(self) -> Series: + """ + Get an array with the cumulative max computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_max`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cummin(self) -> Series: + """ + Get an array with the cumulative min computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_min`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def cumprod(self) -> Series: + """ + Get an array with the cumulative product computed at every element. + + .. deprecated:: 0.19.14 + This method has been renamed to :meth:`cum_prod`. + + Parameters + ---------- + reverse + reverse the operation. + """ + def view(self) -> SeriesView: + """ + Get a view into this Series data with a numpy array. + + .. deprecated:: 0.19.14 + This method will be removed in a future version. + + This operation doesn't clone data, but does not include missing values. + Don't use this unless you know what you are doing. + + Parameters + ---------- + ignore_nulls + If True then nulls are converted to 0. + If False then an Exception is raised if nulls are present. + """ + def map_dict(self, mapping: dict[Any, Any]) -> Self: + """ + Replace values in the Series using a remapping dictionary. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`replace`. 
The default behavior + has changed to keep any values not present in the mapping unchanged. + Pass `default=None` to keep existing behavior. + + Parameters + ---------- + mapping + Dictionary containing the before/after values to map. + default + Value to use when the remapping dict does not contain the lookup value. + Use `pl.first()`, to keep the original value. + return_dtype + Set return dtype to override automatic return dtype determination. + """ + def series_equal(self, other: Series) -> bool: + """ + Check whether the Series is equal to another Series. + + .. deprecated:: 0.19.16 + This method has been renamed to :meth:`equals`. + + Parameters + ---------- + other + Series to compare with. + null_equal + Consider null values as equal. + strict + Don't allow different numerical dtypes, e.g. comparing `pl.UInt32` with a + `pl.Int64` will return `False`. + """ + @property + def dtype(self): ... + @property + def flags(self): ... + @property + def inner_dtype(self): ... + @property + def name(self): ... + @property + def shape(self): ... + @property + def bin(self): ... + @property + def cat(self): ... + @property + def dt(self): ... + @property + def list(self): ... + @property + def arr(self): ... + @property + def str(self): ... + @property + def struct(self): ... + @property + def plot(self): ... +def _resolve_temporal_dtype(dtype: PolarsDataType | None, ndtype: np.dtype[np.datetime64] | np.dtype[np.timedelta64]) -> PolarsDataType | None: + """Given polars/numpy temporal dtypes, resolve to an explicit unit.""" diff --git a/polugins_type_gen/src/polugins_type_gen/cli.py b/polugins_type_gen/src/polugins_type_gen/cli.py index 253c5a3..e26adbc 100644 --- a/polugins_type_gen/src/polugins_type_gen/cli.py +++ b/polugins_type_gen/src/polugins_type_gen/cli.py @@ -42,7 +42,9 @@ def create_stubs(version: str): for extension_class, namespaces in all_namespaces.items(): if namespaces: files = importlib_resources.files("polugins_type_gen") - stub_path = files / "_stubs" / version / extension_class.import_path + stub_path = ( + Path(str(files)) / "_stubs" / version / extension_class.import_path + ).with_suffix(".pyi") stub_ast = ast.parse(stub_path.read_text(), type_comments=True) new_class_nodes = [] modules_to_import = set()
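For context on the `cli.py` hunk above: the traversable returned by `importlib_resources.files` is now coerced to a concrete `Path` and given the `.pyi` suffix before the stub is read and parsed. The following is a minimal, self-contained sketch of that lookup step only; the function name `load_stub_ast`, the example version string, and the use of the stdlib `importlib.resources` module are illustrative assumptions, not part of this change.

    import ast
    from importlib import resources as importlib_resources  # cli.py may use the importlib_resources backport instead
    from pathlib import Path

    def load_stub_ast(version: str, import_path: Path) -> ast.Module:
        # Locate the stub shipped with the package, e.g.
        # src/polugins_type_gen/_stubs/<version>/polars/series/series.pyi
        files = importlib_resources.files("polugins_type_gen")
        stub_path = (Path(str(files)) / "_stubs" / version / import_path).with_suffix(".pyi")
        # Parse the stub so create_stubs() can later extend its class bodies
        # with the registered namespace annotations.
        return ast.parse(stub_path.read_text(), type_comments=True)

    # Hypothetical usage: load the Series stub for one packaged polars version.
    series_ast = load_stub_ast("0.20.3", Path("polars", "series", "series"))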